The dataset lives in the following directory: D:\Office\Own Education\R\Übungen\Applied Regression Using Stata but in R!
First we load the readr package (strictly speaking, read.csv() is part of base R, so this step is optional).
After that we can load the dataset.
library(readr)
data1 <- read.csv("data1.csv", header=TRUE, sep=",")
write.table(data1, file="data1_HA.csv", sep=";") ## Excel opens this file incorrectly
library(writexl)
write_xlsx(data1, path="data1_HA.xlsx") ## Much better!
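If Excel is the real target for a CSV, base R's write.csv2() is another option: it writes the semicolon-separated, comma-decimal format that German-locale Excel expects. A minimal sketch (the file name data1_HA_semicolon.csv is made up here):
write.csv2(data1, file="data1_HA_semicolon.csv", row.names=FALSE) ## ";" separator, "," decimal mark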
data1$educ <- as.factor(data1$edu) ## We create a variable that holds education as a factor
bild <- model.matrix(~data1$educ +0) ## Now we can create a matrix of dummies (+0 = no intercept)
bild <- as.data.frame(bild) ## We store it as a data frame
library(dplyr)
library(tibble)
data1 <- bind_cols(data1,bild) ## And combine both back into the original dataset
data1$size_m2 <- round((data1$size/10.7639), digits=0) ## Rounded to zero decimals (digits=0)
### Create an East-West dummy
data1$state <- as.character(data1$state)
east <- c("Mecklenburg-V.", "Brandenburg", "Saxony-Anhalt", "Thueringen", "Saxony")
data2 <- data1 %>% mutate(state = ifelse(state %in% east, 1L, 0L))
data1$west_east <- data2$state
data1$west_east[data1$sample == "C (East Germany)"] <- 1L
### Rent per Square Meter
missings <- as.factor(c(".a",".b",".c", ".d"))
data1$rent[data1$rent %in% missings] <- NA
data1$rent <- as.numeric(data1$rent)
data1$rent_sqm <- data1$rent/data1$size_m2
### Average Rent per square meter in West Germany
mean(data1$rent_sqm[data1$west_east==0], na.rm = TRUE)
## [1] 6.243343
\(age = survey\ year_{2009} - year\ of\ birth_{ybirth}\)
data1$age <- 2009-data1$ybirth
data1$lives_alone <- data1$hhsize
data1$lives_alone[data1$lives_alone > 1] <- 0
summary(data1$lives_alone)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1678 0.0000 1.0000
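The same variable can be built in one line; a sketch using a logical comparison coerced to integer (lives_alone_alt is a hypothetical name):
data1$lives_alone_alt <- as.integer(data1$hhsize == 1) ## TRUE/FALSE becomes 1/0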
data1$egp_cut <- as.character(data1$egp)
data1$egp_cut[grep("Service class", data1$egp_cut)] <- "Service Class"
data1$egp_cut[grep(" manual workers", data1$egp_cut)] <- "Manual Class"
data1$egp_cut[grep("non-manuals", data1$egp_cut)] <- "Non-Manual Class"
data1$egp_cut[data1$egp_cut=="Retired"] <- NA
data1$egp_cut[data1$egp_cut=="unemployed"] <- NA
data1$egp_cut[data1$egp_cut=="Does not apply"] <- NA
data1$egp_cut[data1$egp_cut=="Refusal"] <- NA
table(data1$egp_cut)
##
## Manual Class Non-Manual Class Self-Employed Service Class
## 1113 669 213 1093
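The same recode also fits into a single dplyr step; a sketch with case_when (egp_cut2 is a hypothetical name):
data1 <- data1 %>%
  mutate(egp_cut2 = case_when(
    grepl("Service class", egp) ~ "Service Class",
    grepl(" manual workers", egp) ~ "Manual Class",
    grepl("non-manuals", egp) ~ "Non-Manual Class",
    egp %in% c("Retired", "unemployed", "Does not apply", "Refusal") ~ NA_character_,
    TRUE ~ as.character(egp)  ## keep remaining categories such as Self-Employed
  ))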
In R we do not attach value labels. However, we can store the underlying variable as a factor variable!
data1$egp_cut_fac <- as.factor(data1$egp_cut)
summary(data1$egp_cut_fac)
## Manual Class Non-Manual Class Self-Employed Service Class
## 1113 669 213 1093
## NA's
## 2323
str(data1$egp_cut_fac)
## Factor w/ 4 levels "Manual Class",..: NA NA NA 4 2 NA NA 2 1 4 ...
data1$inc_cut <- data1$income
data1$inc_cut[between(data1$income, 0, 10000)] <- "0 to 10.000"
data1$inc_cut[between(data1$income, 10001, 20000)] <- "10.001 to 20.000"
data1$inc_cut[between(data1$income, 20001, 50000)] <- "20.001 to 50.000"
data1$inc_cut[between(data1$income, 50001, 100000)] <- "50.001 to 100.000"
data1$inc_cut[data1$income > 100000] <- "Over 100.000"
data1$inc_cut <- as.factor(data1$inc_cut)
summary(data1$inc_cut)
## 0 to 10.000 10.001 to 20.000 20.001 to 50.000 50.001 to 100.000
## 2219 666 1489 357
## Over 100.000 NA's
## 48 632
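Base R's cut() builds the same income classes in one call; a sketch (inc_cut2 is a hypothetical name, assuming incomes are non-negative):
data1$inc_cut2 <- cut(data1$income,
                      breaks = c(0, 10000, 20000, 50000, 100000, Inf),
                      labels = c("0 to 10.000", "10.001 to 20.000", "20.001 to 50.000",
                                 "50.001 to 100.000", "Over 100.000"),
                      include.lowest = TRUE) ## right-closed intervals match the manual recode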
### Saving the Data
write_xlsx(data1, path="data1_HA.xlsx")
### Saving the script via CTRL + S, in code
In principle this is not necessary, because I am already working inside D:\Office\Own Education\R\Übungen\Applied Regression Using Stata but in R!
However, the code would look like this:
setwd("D:\\Office\\Own Education\\R\\Übungen\\Applied Regression Using Stata but in R!")
## First, let's create a row total!
wor <- select(data1, starts_with("wor"))
wor <- wor %>% mutate_all(as.character)
glimpse(wor)
## Rows: 5,411
## Columns: 12
## $ wor01 <chr> "Very concerned", "Very concerned", "Somewhat concerned", "Ve...
## $ wor02 <chr> "Somewhat concerned", "Somewhat concerned", "Somewhat concern...
## $ wor03 <chr> "Very concerned", "Very concerned", "Somewhat concerned", "So...
## $ wor04 <chr> "Very concerned", "Very concerned", "Very concerned", "Somewh...
## $ wor05 <chr> "Very concerned", "Very concerned", "Somewhat concerned", "So...
## $ wor06 <chr> "Very concerned", "Very concerned", "Somewhat concerned", "So...
## $ wor07 <chr> "Somewhat concerned", "Somewhat concerned", "Somewhat concern...
## $ wor08 <chr> "Very concerned", "Very concerned", "Somewhat concerned", "So...
## $ wor09 <chr> "Very concerned", "Very concerned", "Very concerned", "Somewh...
## $ wor10 <chr> "Not concerned at all", "Not concerned at all", "Somewhat con...
## $ wor11 <chr> "Very concerned", "Very concerned", "Somewhat concerned", "So...
## $ wor12 <chr> "Does not apply", "Does not apply", "Does not apply", "Not co...
wor[wor==""] <- NA
wor[wor=="Refusal"] <-NA
wor[wor=="Not concerned at all"] <- "0"
wor[wor=="Somewhat concerned"] <- "1"
wor[wor=="Very concerned"] <- "2"
wor[wor=="Does not apply"] <- NA
wor <- wor %>% mutate_all(as.numeric)
wor$addwor <- rowSums(wor[c(1:12)], na.rm = TRUE)
head(wor$addwor) ## missings are dropped from the sums (na.rm = TRUE)
## [1] 18 18 13 12 15 21
## We can exclude cases with missings by using na.rm = FALSE
wor$addwor <- rowSums(wor[c(1:12)], na.rm = FALSE)
head(wor$addwor) ##addwor = NA if wor* == NA
## [1] NA NA NA 12 15 NA
## But now roughly following the logic required by the exercise
wor$addwor <- rowSums(wor[c(1:12)], na.rm = TRUE) ## reset the variable to its initial state
wor$miss_wor <- complete.cases(wor[c(1:12)]) ## indicator variable, FALSE = the row has missings
wor$addwor[wor$miss_wor == FALSE] <- NA
head(wor$addwor) ##addwor = NA if wor* == NA
## [1] NA NA NA 12 15 NA
## We add these variables to data1
data1$addwor <- wor$addwor
data1$miss_wor <- wor$miss_wor
mean(data1$addwor, na.rm = TRUE)
## [1] 12.70903
This already happened when we recoded the variables.
On average, women are more worried!
mean(data1$addwor[data1$sex=="Male"], na.rm = TRUE)
## [1] 12.32838
mean(data1$addwor[data1$sex=="Female"], na.rm = TRUE)
## [1] 13.10214
reg1 <- lm(addwor ~ sex, data=data1)
reg1
##
## Call:
## lm(formula = addwor ~ sex, data = data1)
##
## Coefficients:
## (Intercept) sexMale
## 13.1021 -0.7738
The reference category is women. That is, the intercept gives the mean of \(addwor\) for women. The regression coefficient \(\beta_{sex}\) means that men score 0.7738 index points lower. Calculated out:
13.1021 - 0.7738
## [1] 12.3283
Both results roughly match the group means. A simple regression with a single categorical predictor always reproduces the means of the dependent variable at the levels of the independent variable.
## We center and recode yedu and age!
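A quick check of this claim (a sketch): the group means and the model predictions for both levels of sex should coincide.
tapply(data1$addwor, data1$sex, mean, na.rm = TRUE)            ## group means
predict(reg1, newdata = data.frame(sex = c("Female", "Male"))) ## fitted means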
data1$yedu[data1$yedu %in% missings] <- NA
data1$yedu <- as.numeric(data1$yedu)
mean_yedu <- mean(data1$yedu, na.rm = TRUE)
data1$yedu_centr <- data1$yedu - mean_yedu
mean_age <- mean(data1$age, na.rm = TRUE)
data1$age_centr <- data1$age - mean_age
reg2 <- lm(addwor ~ sex+yedu_centr+age_centr, data=data1)
reg2
##
## Call:
## lm(formula = addwor ~ sex + yedu_centr + age_centr, data = data1)
##
## Coefficients:
## (Intercept) sexMale yedu_centr age_centr
## 13.36057 -0.81080 -0.16427 0.03728
The intercept (constant) now corresponds to the average worries of women with an average number of years of education (about 6 years) and an average age (about 50 years).
The coefficient for years of education is larger. We standardize the coefficients to express them in standard deviations; that way the coefficients can be compared!
library(lm.beta)
reg2_z <- lm.beta(reg2)
reg2_z
##
## Call:
## lm(formula = addwor ~ sex + yedu_centr + age_centr, data = data1)
##
## Standardized Coefficients::
## (Intercept) sexMale yedu_centr age_centr
## 0.00000000 -0.08928953 -0.09949876 0.09745538
The z-standardized coefficient for years of education is (still) larger in magnitude than that for age!
Do they differ? Yes! But on an index that runs from a minimum of 0 to a maximum of 24, that is relatively little oomph for a binary variable! At the significance level \(p<0.05\) the effect is significant.
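As a cross-check (a sketch), a standardized coefficient is simply \(b \cdot sd(x)/sd(y)\), computed on the estimation sample of reg2:
est <- model.frame(reg2) ## the rows actually used in the fit
coef(reg2)[["yedu_centr"]] * sd(est$yedu_centr) / sd(est$addwor)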
summary(reg2)
##
## Call:
## lm(formula = addwor ~ sex + yedu_centr + age_centr, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.5995 -2.7491 -0.0257 2.8679 12.5319
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.36057 0.12615 105.914 < 2e-16 ***
## sexMale -0.81080 0.16646 -4.871 1.17e-06 ***
## yedu_centr -0.16427 0.03027 -5.427 6.19e-08 ***
## age_centr 0.03728 0.00701 5.318 1.13e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.482 on 2900 degrees of freedom
## (2507 observations deleted due to missingness)
## Multiple R-squared: 0.02638, Adjusted R-squared: 0.02537
## F-statistic: 26.19 on 3 and 2900 DF, p-value: < 2.2e-16
## Let's check for missings etc.
table(data1$lsat)
##
## 1 2 3
## 27 68 167
## 4 6 7
## 249 662 1202
## 8 9 Completely dissatisfied
## 1533 467 13
## Completely satisfied Intermediate Refusal
## 212 723 88
## The variable should be an 11-point (0-10) scale
data1$lsat <- as.character(data1$lsat)
data1$lsat[data1$lsat == "Refusal"] <- NA
data1$lsat[data1$lsat == "Completely dissatisfied"] <- 0
data1$lsat[data1$lsat == "Completely satisfied"] <- 10
data1$lsat[data1$lsat == "Intermediate"] <- 5
data1$lsat <- as.numeric(data1$lsat)
glimpse(data1$lsat) ## Now numeric
## num [1:5411] 8 8 8 8 7 6 6 5 5 8 ...
table(data1$lsat)
##
## 0 1 2 3 4 5 6 7 8 9 10
## 13 27 68 167 249 723 662 1202 1533 467 212
## hhinc should be fine!
summary(data1$hhinc)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 583 21143 32461 37150 46687 507369 4
glimpse(data1$hhinc)
## int [1:5411] 22093 22093 62078 62078 24578 33401 33401 30237 30237 43197 ...
##Regression
reg_lsat <- lm(lsat ~ hhinc, data=data1)
summary(reg_lsat)
##
## Call:
## lm(formula = lsat ~ hhinc, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.8854 -0.9980 0.2779 1.2402 3.5297
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.442e+00 4.121e-02 156.30 <2e-16 ***
## hhinc 9.934e-06 9.015e-07 11.02 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.761 on 5317 degrees of freedom
## (92 observations deleted due to missingness)
## Multiple R-squared: 0.02233, Adjusted R-squared: 0.02214
## F-statistic: 121.4 on 1 and 5317 DF, p-value: < 2.2e-16
dim(reg_lsat$model) ##5319
## [1] 5319 2
reg_lsat[["coefficients"]][["hhinc"]] ## oder
## [1] 9.934276e-06
reg_lsat$coefficients[["hhinc"]] ## oder
## [1] 9.934276e-06
getElement(reg_lsat$coefficients, "hhinc")
## [1] 9.934276e-06
sk_lsat <- reg_lsat$coefficients[["(Intercept)"]] + (6000*reg_lsat$coefficients[["hhinc"]])
sk_lsat
## [1] 6.501492
## Tends toward happy
twk_lsat <- reg_lsat$coefficients[["(Intercept)"]] + (12000*reg_lsat$coefficients[["hhinc"]])
twk_lsat
## [1] 6.561098
## Tends toward happy
twk_lsat - sk_lsat
## [1] 0.05960565
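The same two predictions can come straight from predict(); a sketch:
predict(reg_lsat, newdata = data.frame(hhinc = c(6000, 12000))) ## incomes of 6,000 and 12,000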
As a rough rule of thumb: we want to include in the model all variables that affect both the X and the Y variable, and exclude all those that are themselves affected by the X variable. Here education, occupation (or occupational group/class), sex, migration background (if available), place of residence, and age can sensibly be considered.
con_lsat <-lm(lsat ~ hhinc+yedu_centr+egp_cut+sex+west_east+age^2+age, data=data1)
summary(con_lsat)
##
## Call:
## lm(formula = lsat ~ hhinc + yedu_centr + egp_cut + sex + west_east +
## age^2 + age, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.6224 -0.9943 0.2554 1.1162 3.8544
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.287e+00 1.346e-01 54.156 < 2e-16 ***
## hhinc 6.284e-06 1.136e-06 5.534 3.41e-08 ***
## yedu_centr 1.544e-02 1.157e-02 1.334 0.182276
## egp_cutNon-Manual Class 7.528e-02 8.703e-02 0.865 0.387135
## egp_cutSelf-Employed 1.602e-02 1.278e-01 0.125 0.900260
## egp_cutService Class 2.570e-01 7.633e-02 3.367 0.000769 ***
## sexMale -5.934e-02 6.412e-02 -0.925 0.354844
## west_east -4.131e-01 6.882e-02 -6.003 2.17e-09 ***
## age -1.357e-02 2.622e-03 -5.176 2.42e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.65 on 2935 degrees of freedom
## (2467 observations deleted due to missingness)
## Multiple R-squared: 0.04185, Adjusted R-squared: 0.03924
## F-statistic: 16.03 on 8 and 2935 DF, p-value: < 2.2e-16
##root-mean-squared-error
library(modelr)
rmse(con_lsat, data1)
## [1] 1.647165
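One caveat about the formula above: inside an R formula, age^2 is formula algebra rather than arithmetic and collapses to age, which is why no quadratic term appears in the coefficient table. A sketch of how an actual quadratic term would be specified (con_lsat_quad is a hypothetical name):
con_lsat_quad <- lm(lsat ~ hhinc + yedu_centr + egp_cut + sex + west_east + age + I(age^2), data = data1) ## I() protects the arithmetic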
If we want to explore the causal mechanisms of an X variable, we want to name the factors that make up the effect of X on Y. For example: households with different incomes have different worries, which in turn can affect life satisfaction. Moreover, households with different incomes can afford correspondingly larger homes, and home size could in turn influence life satisfaction.
mediation1_lsat <-lm(lsat ~ hhinc+yedu_centr+egp_cut+sex+west_east+age^2+age+size_m2, data=data1)
summary(mediation1_lsat)
##
## Call:
## lm(formula = lsat ~ hhinc + yedu_centr + egp_cut + sex + west_east +
## age^2 + age + size_m2, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.6570 -1.0013 0.2617 1.1223 3.8671
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.188e+00 1.454e-01 49.426 < 2e-16 ***
## hhinc 5.317e-06 1.257e-06 4.230 2.41e-05 ***
## yedu_centr 1.696e-02 1.160e-02 1.462 0.14378
## egp_cutNon-Manual Class 6.851e-02 8.708e-02 0.787 0.43151
## egp_cutSelf-Employed 1.756e-03 1.280e-01 0.014 0.98906
## egp_cutService Class 2.523e-01 7.635e-02 3.305 0.00096 ***
## sexMale -6.413e-02 6.415e-02 -1.000 0.31758
## west_east -4.010e-01 6.913e-02 -5.800 7.32e-09 ***
## age -1.384e-02 2.625e-03 -5.272 1.45e-07 ***
## size_m2 1.443e-03 8.061e-04 1.790 0.07361 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.649 on 2934 degrees of freedom
## (2467 observations deleted due to missingness)
## Multiple R-squared: 0.0429, Adjusted R-squared: 0.03996
## F-statistic: 14.61 on 9 and 2934 DF, p-value: < 2.2e-16
mediation_lsat <-lm(lsat ~ hhinc+yedu_centr+egp_cut+sex+west_east+age^2+age+size_m2+addwor, data=data1)
summary(mediation_lsat)
##
## Call:
## lm(formula = lsat ~ hhinc + yedu_centr + egp_cut + sex + west_east +
## age^2 + age + size_m2 + addwor, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.8283 -0.9217 0.2769 1.0639 4.0518
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.098e+00 1.687e-01 47.999 < 2e-16 ***
## hhinc 4.161e-06 1.244e-06 3.346 0.000831 ***
## yedu_centr 1.804e-02 1.172e-02 1.539 0.123865
## egp_cutNon-Manual Class 2.601e-02 8.678e-02 0.300 0.764456
## egp_cutSelf-Employed -1.133e-01 1.325e-01 -0.855 0.392434
## egp_cutService Class 1.334e-01 7.660e-02 1.742 0.081692 .
## sexMale -1.214e-01 6.438e-02 -1.885 0.059494 .
## west_east -3.706e-01 6.929e-02 -5.349 9.54e-08 ***
## age -1.062e-02 2.715e-03 -3.913 9.35e-05 ***
## size_m2 1.482e-03 8.035e-04 1.844 0.065250 .
## addwor -7.218e-02 6.975e-03 -10.348 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.602 on 2766 degrees of freedom
## (2634 observations deleted due to missingness)
## Multiple R-squared: 0.07682, Adjusted R-squared: 0.07349
## F-statistic: 23.02 on 10 and 2766 DF, p-value: < 2.2e-16
## Difference
format(con_lsat$coefficients[["hhinc"]] - mediation_lsat$coefficients[["hhinc"]], scientific = FALSE)
## [1] "0.000002123355"
## Comparison 1
library(sjPlot)
## #refugeeswelcome
tab_model(reg_lsat, con_lsat, mediation1_lsat, mediation_lsat)
The dependent variable in all four models is lsat:

| Predictors | Estimates | CI | p | Estimates | CI | p | Estimates | CI | p | Estimates | CI | p |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| (Intercept) | 6.44 | 6.36 – 6.52 | <0.001 | 7.29 | 7.02 – 7.55 | <0.001 | 7.19 | 6.90 – 7.47 | <0.001 | 8.10 | 7.77 – 8.43 | <0.001 |
| hhinc | 0.00 | 0.00 – 0.00 | <0.001 | 0.00 | 0.00 – 0.00 | <0.001 | 0.00 | 0.00 – 0.00 | <0.001 | 0.00 | 0.00 – 0.00 | 0.001 |
| yedu_centr | | | | 0.02 | -0.01 – 0.04 | 0.182 | 0.02 | -0.01 – 0.04 | 0.144 | 0.02 | -0.00 – 0.04 | 0.124 |
| egp_cut [Non-Manual Class] | | | | 0.08 | -0.10 – 0.25 | 0.387 | 0.07 | -0.10 – 0.24 | 0.432 | 0.03 | -0.14 – 0.20 | 0.764 |
| egp_cut [Self-Employed] | | | | 0.02 | -0.23 – 0.27 | 0.900 | 0.00 | -0.25 – 0.25 | 0.989 | -0.11 | -0.37 – 0.15 | 0.392 |
| egp_cut [Service Class] | | | | 0.26 | 0.11 – 0.41 | 0.001 | 0.25 | 0.10 – 0.40 | 0.001 | 0.13 | -0.02 – 0.28 | 0.082 |
| sex [Male] | | | | -0.06 | -0.19 – 0.07 | 0.355 | -0.06 | -0.19 – 0.06 | 0.318 | -0.12 | -0.25 – 0.00 | 0.059 |
| west_east | | | | -0.41 | -0.55 – -0.28 | <0.001 | -0.40 | -0.54 – -0.27 | <0.001 | -0.37 | -0.51 – -0.23 | <0.001 |
| age | | | | -0.01 | -0.02 – -0.01 | <0.001 | -0.01 | -0.02 – -0.01 | <0.001 | -0.01 | -0.02 – -0.01 | <0.001 |
| size_m2 | | | | | | | 0.00 | -0.00 – 0.00 | 0.074 | 0.00 | -0.00 – 0.00 | 0.065 |
| addwor | | | | | | | | | | -0.07 | -0.09 – -0.06 | <0.001 |
| Observations | 5319 | | | 2944 | | | 2944 | | | 2777 | | |
| R2 / R2 adjusted | 0.022 / 0.022 | | | 0.042 / 0.039 | | | 0.043 / 0.040 | | | 0.077 / 0.073 | | |
## Comparison 2
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
# As text
## stargazer(reg_lsat, con_lsat, mediation1_lsat, mediation_lsat, type = "text")
# As LaTeX
## stargazer(reg_lsat, con_lsat, mediation1_lsat, mediation_lsat, type = "latex")
# As HTML
## stargazer(reg_lsat, con_lsat, mediation1_lsat, mediation_lsat, type = "html")
The result of the HTML code would look like this:
Dependent variable: lsat

| | (1) | (2) | (3) | (4) |
|---|---|---|---|---|
| hhinc | 0.00001*** (0.00000) | 0.00001*** (0.00000) | 0.00001*** (0.00000) | 0.00000*** (0.00000) |
| yedu_centr | | 0.015 (0.012) | 0.017 (0.012) | 0.018 (0.012) |
| egp_cutNon-Manual Class | | 0.075 (0.087) | 0.069 (0.087) | 0.026 (0.087) |
| egp_cutSelf-Employed | | 0.016 (0.128) | 0.002 (0.128) | -0.113 (0.132) |
| egp_cutService Class | | 0.257*** (0.076) | 0.252*** (0.076) | 0.133* (0.077) |
| sexMale | | -0.059 (0.064) | -0.064 (0.064) | -0.121* (0.064) |
| west_east | | -0.413*** (0.069) | -0.401*** (0.069) | -0.371*** (0.069) |
| age | | -0.014*** (0.003) | -0.014*** (0.003) | -0.011*** (0.003) |
| size_m2 | | | 0.001* (0.001) | 0.001* (0.001) |
| addwor | | | | -0.072*** (0.007) |
| Constant | 6.442*** (0.041) | 7.287*** (0.135) | 7.188*** (0.145) | 8.098*** (0.169) |
| Observations | 5,319 | 2,944 | 2,944 | 2,777 |
| R2 | 0.022 | 0.042 | 0.043 | 0.077 |
| Adjusted R2 | 0.022 | 0.039 | 0.040 | 0.073 |
| Residual Std. Error | 1.761 (df = 5317) | 1.650 (df = 2935) | 1.649 (df = 2934) | 1.602 (df = 2766) |
| F Statistic | 121.429*** (df = 1; 5317) | 16.026*** (df = 8; 2935) | 14.612*** (df = 9; 2934) | 23.018*** (df = 10; 2766) |
| Note: | *p<0.1; **p<0.05; ***p<0.01 | | | |
## First, test X:
summary(data1$emp)
##  (blank)  full time  irregular  not employed  part time
##      155       2041        288          2328        599
data1$emp_cut <- as.character(data1$emp)
data1$emp_cut[data1$emp_cut ==""] <-NA
data1$emp_cut[data1$emp_cut =="not employed"] <-NA
table(data1$emp_cut)
##
## full time irregular part time
## 2041 288 599
ha4 <- lm(income ~ emp_cut+age_centr+sex+age_centr*sex, data=data1)
summary(ha4)
##
## Call:
## lm(formula = income ~ emp_cut + age_centr + sex + age_centr *
## sex, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -52659 -12416 -3330 5051 856115
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32364.3 1536.1 21.069 < 2e-16 ***
## emp_cutirregular -29032.4 2582.8 -11.241 < 2e-16 ***
## emp_cutpart time -16262.4 2051.2 -7.928 3.14e-15 ***
## age_centr 194.2 91.5 2.123 0.03388 *
## sexMale 14433.8 1826.9 7.901 3.90e-15 ***
## age_centr:sexMale 412.0 127.7 3.227 0.00126 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38950 on 2909 degrees of freedom
## (2496 observations deleted due to missingness)
## Multiple R-squared: 0.1239, Adjusted R-squared: 0.1224
## F-statistic: 82.29 on 5 and 2909 DF, p-value: < 2.2e-16
ha4$coefficients[["(Intercept)"]]+ha4$coefficients[["age_centr"]]*1
## [1] 32558.46
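The same figure comes from predict(); a sketch for a full-time woman one year above the mean age:
predict(ha4, newdata = data.frame(emp_cut = "full time", age_centr = 1, sex = "Female"))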
ha4_2 <- lm(income ~ relevel(as.factor(emp_cut), ref="part time")+age_centr+sex+age_centr*sex, data=data1)
summary(ha4_2)
##
## Call:
## lm(formula = income ~ relevel(as.factor(emp_cut), ref = "part time") +
## age_centr + sex + age_centr * sex, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -52659 -12416 -3330 5051 856115
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 16101.8 1651.0
## relevel(as.factor(emp_cut), ref = "part time")full time 16262.4 2051.2
## relevel(as.factor(emp_cut), ref = "part time")irregular -12770.0 2843.8
## age_centr 194.2 91.5
## sexMale 14433.8 1826.9
## age_centr:sexMale 412.0 127.7
## t value Pr(>|t|)
## (Intercept) 9.753 < 2e-16 ***
## relevel(as.factor(emp_cut), ref = "part time")full time 7.928 3.14e-15 ***
## relevel(as.factor(emp_cut), ref = "part time")irregular -4.490 7.38e-06 ***
## age_centr 2.123 0.03388 *
## sexMale 7.901 3.90e-15 ***
## age_centr:sexMale 3.227 0.00126 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38950 on 2909 degrees of freedom
## (2496 observations deleted due to missingness)
## Multiple R-squared: 0.1239, Adjusted R-squared: 0.1224
## F-statistic: 82.29 on 5 and 2909 DF, p-value: < 2.2e-16
## Predicted values for actual values with "margins"
library(margins)
invisible(age_centr <- cplot(ha4, "age_centr", draw = FALSE))
library(ggplot2)
plottig_1 <- ggplot(age_centr, aes(y=yvals, x = xvals)) +
geom_line() +
geom_line(aes(y = upper), linetype = 2) +
geom_line(aes(y = lower), linetype = 2)
plottig_1
plottig_2 <- ggplot(age_centr, aes(y=yvals, x = xvals)) +
geom_line() +
geom_point() +
geom_errorbar(aes(ymin=lower, ymax=upper))
plottig_2
## Predicted values for a set range of values with "margins"
age_centr_2 <- cplot(ha4, "age_centr", draw = FALSE, xvals = seq(from = -40, to =40, by=10))
plottig_3 <- ggplot(age_centr_2, aes(y=yvals, x = xvals)) +
geom_line() +
geom_line(aes(y = upper), linetype = 2) +
geom_line(aes(y = lower), linetype = 2)
plottig_3
invisible(plottig_4 <- ggplot(age_centr_2, aes(y=yvals, x = xvals)) +
geom_line() +
geom_point() +
geom_errorbar(aes(ymin=lower, ymax=upper),width = 4)+
xlim(-45, 45) +
scale_x_continuous(breaks=seq(-40,40,10)))
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
plottig_4
## By-sex
age_centr_Male <- cplot(ha4, "age_centr", draw = FALSE, xvals = seq(from = -40, to =40, by=10), data = data1[data1$sex=="Male",])
age_centr_Male$sex <- "Male"
age_centr_Female <- cplot(ha4, "age_centr", draw = FALSE, xvals = seq(from = -40, to =40, by=10), data = data1[data1$sex=="Female",])
age_centr_Female$sex <- "Female"
age_centr_3 <- bind_rows(age_centr_Male,age_centr_Female)
## In color
plottig_5 <- ggplot(age_centr_3, aes(y=yvals, x = xvals, colour=factor(sex))) +
geom_line() +
geom_point() +
geom_errorbar(aes(ymin=lower, ymax=upper),width = 4)+
xlim(-45, 45) +
scale_x_continuous(breaks=seq(-40,40,10)) +
ylab("Predicted Income") +
xlab("Alter (zentriert)") +
labs(title = "Predicted Values mit 95%-KI")
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
plottig_5
## Grayscale
plottig_6 <- ggplot(age_centr_3, aes(y=yvals, x = xvals, colour=factor(sex))) +
geom_line() +
geom_point() +
geom_errorbar(aes(ymin=lower, ymax=upper),width = 4)+
xlim(-45, 45) +
scale_x_continuous(breaks=seq(-40,40,10)) +
ylab("Predicted Income") +
xlab("Alter (zentriert)") +
labs(title = "Predicted Values mit 95%-KI") +
scale_colour_grey()
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
plottig_6
## Grayscale with different point shapes
plottig_7 <- ggplot(age_centr_3, aes(y=yvals, x = xvals, colour=factor(sex))) +
geom_line() +
geom_point(aes(shape = factor(sex))) +
geom_errorbar(aes(ymin=lower, ymax=upper),width = 4)+
xlim(-45, 45) +
scale_x_continuous(breaks=seq(-40,40,10)) +
ylab("Predicted Income") +
xlab("Alter (zentriert)") +
labs(title = "Predicted Values mit 95%-KI") +
scale_colour_grey()
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
plottig_7
## Forest Plot
ha4_df <- bind_cols(bind_rows(as.data.frame(coef(summary(ha4)))),bind_rows(as.data.frame(names(ha4$coefficients))))
ha4_df$upper <- ha4_df$Estimate + 1.96* ha4_df$`Std. Error`
ha4_df$lower <- ha4_df$Estimate - 1.96* ha4_df$`Std. Error`
plottig_8 <- ggplot(ha4_df, aes(y=`names(ha4$coefficients)`, x = Estimate, color=`names(ha4$coefficients)`)) +
geom_point() +
geom_errorbarh(aes(xmin=lower, xmax=upper), height=0.3) +
ylab("Name(Koeffizient)") +
xlab("Wert(Koeffizient)") +
labs(title= "Koeffizienten mit 95%-KI") +
geom_vline(xintercept=0, linetype="dashed") +
scale_x_continuous(breaks=seq(-40000,40000,10000))+
theme(plot.title = element_text(hjust = 0.5)) +
theme(legend.position = "none")
plottig_8
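If the broom package is available, the same coefficient frame can be had in one step; a sketch (ha4_tidy is a hypothetical name):
library(broom)
ha4_tidy <- tidy(ha4, conf.int = TRUE) ## columns term, estimate, std.error, conf.low, conf.high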
The following exercises come from the Kohler and Kreuter book (2016, pp. 404-405):
webuse nlsw88, clear
library(Counterfactual)
## The dataset is now stored as nlsw88
data(nlsw88)
We estimate a logistic regression of union on the following independent variables: tenure (centered), age (centered), collgrad, and race.
nlsw88$age_centr <- nlsw88$age - mean(nlsw88$age)
nlsw88$tenure_centr <- nlsw88$tenure - mean(nlsw88$tenure, na.rm=TRUE)
log_reg <- glm(union ~ tenure_centr + age_centr + collgrad + as.factor(race), family = "binomial", data=nlsw88)
summary(log_reg)
##
## Call:
## glm(formula = union ~ tenure_centr + age_centr + collgrad + as.factor(race),
## family = "binomial", data = nlsw88)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3027 -0.7637 -0.6447 -0.5682 1.9521
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.439307 0.077964 -18.461 < 2e-16 ***
## tenure_centr 0.048449 0.009342 5.186 2.15e-07 ***
## age_centr 0.004898 0.018115 0.270 0.786875
## collgrad 0.500384 0.121307 4.125 3.71e-05 ***
## as.factor(race)2 0.445773 0.120416 3.702 0.000214 ***
## as.factor(race)3 0.592559 0.443986 1.335 0.181996
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2085.4 on 1867 degrees of freedom
## Residual deviance: 2024.8 on 1862 degrees of freedom
## (378 observations deleted due to missingness)
## AIC: 2036.8
##
## Number of Fisher Scoring iterations: 4
## First way
log_reg_up <- update(log_reg,na.action=na.exclude)
nlsw88$yhat <- predict.glm(log_reg_up, type = "response")
## Second way
yhat <- as.data.frame(predict.glm(log_reg, type = "response"))
nlsw88 <- merge(nlsw88,yhat, by=0, all=TRUE) ## by=0 -> by = row.names
nlsw88 <- rename(nlsw88, yhat_2 = `predict.glm(log_reg, type = "response")`)
nlsw88 <- nlsw88[order(nlsw88$idcode),]
head(nlsw88)[,c(21:22)]
## yhat yhat_2
## 1 0.2620566 0.2620566
## 1112 0.2612766 0.2612766
## 1470 NA NA
## 1581 0.2451001 0.2451001
## 1692 0.2983826 0.2983826
## 1803 0.1651126 0.1651126
all.equal(nlsw88$yhat, nlsw88$yhat_2)
## [1] TRUE
## Because of the intercept, this is automatically for whites!
exp(log_reg$coefficients[["(Intercept)"]])
## [1] 0.2370919
exp(log_reg$coefficients[["(Intercept)"]] + log_reg$coefficients[["as.factor(race)2"]]*1 + log_reg$coefficients[["collgrad"]]*1)
## [1] 0.6106993
exp(log_reg$coefficients[["(Intercept)"]] + log_reg$coefficients[["collgrad"]]*1)
## [1] 0.3910487
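To turn such odds back into a probability (a sketch), use \(p = odds/(1+odds)\):
odds <- exp(log_reg$coefficients[["(Intercept)"]])
odds / (1 + odds) ## Pr(union=1) for the reference group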
## Tenure Centered
newdata <- data.frame(collgrad = 0, age_centr = 0, tenure_centr = c(-20:20), race = 1)
predict_5<- bind_cols(tenure_centr = c(-20:20), as.data.frame(predict.glm(log_reg, newdata, type="response", se.fit = TRUE)))
predict_5$upper <- predict_5$fit + 1.96 * predict_5$se.fit
predict_5$lower <- predict_5$fit - 1.96 * predict_5$se.fit
## For actual years
predict_years <- data.frame(collgrad = 0, age_centr = 0, tenure_centr = c(0:40), race = 1)
predict_years$tenure_centr <- predict_years$tenure_centr - mean(nlsw88$tenure, na.rm=TRUE)
pr <- bind_cols(data.frame(tenure = predict_years$tenure_centr), as.data.frame(predict.glm(log_reg, predict_years, type="response", se.fit = TRUE)))
pr$tenure <- pr$tenure + mean(nlsw88$tenure, na.rm=TRUE)
pr$upper <- pr$fit + 1.96 * pr$se.fit
pr$lower <- pr$fit - 1.96 * pr$se.fit
## PLOTS
plot_log <- ggplot(predict_5, aes(y=fit, x = tenure_centr)) +
geom_line() +
geom_point() +
geom_errorbar(aes(ymin=lower, ymax=upper),width = 0.5)+
scale_x_continuous(breaks=seq(-20,20,10)) +
ylab("Pr(union=1)") +
xlab("Tenure (centered)") +
labs(title = "Geschätzte Wahrscheinlichkeiten mit 95%-KI") +
scale_colour_grey() +
geom_hline(yintercept=0, linetype="dashed", color = "firebrick")
plot_log
plot_log_2 <- ggplot(predict_5, aes(y=fit, x = tenure_centr)) +
geom_line() +
geom_line(aes(y = upper), linetype = 2) +
geom_line(aes(y = lower), linetype = 2) +
ylab("Pr(union=1)") +
xlab("Tenure (centered)") +
labs(title = "Geschätzte Wahrscheinlichkeiten mit 95%-KI") +
scale_colour_grey() +
geom_hline(yintercept=0, linetype="dashed", color = "firebrick")
plot_log_2
plot_log_3 <- ggplot(pr, aes(y=fit, x = tenure)) +
geom_line() +
geom_point() +
geom_errorbar(aes(ymin=lower, ymax=upper),width = 0.5)+
scale_x_continuous(breaks=seq(0,40,10)) +
ylab("Pr(union=1)") +
xlab("Tenure (years)") +
labs(title = "Geschätzte Wahrscheinlichkeiten mit 95%-KI") +
scale_colour_grey() +
geom_hline(yintercept=0, linetype="dashed", color = "firebrick") +
theme(plot.title = element_text(hjust = 0.5))
plot_log_3
newdata_2 <- data.frame(collgrad = 1, age_centr = 0, tenure_centr = c(-20:20), race = 1)
predict_6<- bind_cols(tenure_centr = c(-20:20), as.data.frame(predict.glm(log_reg, newdata_2, type="response", se.fit = TRUE)))
predict_6$upper <- predict_6$fit + 1.96 * predict_6$se.fit
predict_6$lower <- predict_6$fit - 1.96 * predict_6$se.fit
plot_log_4 <- ggplot(predict_6, aes(y=fit, x = tenure_centr)) +
geom_line() +
geom_point() +
geom_errorbar(aes(ymin=lower, ymax=upper),width = 0.5)+
scale_x_continuous(breaks=seq(-20,20,10)) +
ylab("Pr(union=1)") +
xlab("Tenure (centered)") +
labs(title = "Geschätzte Wahrscheinlichkeiten mit 95%-KI") +
scale_colour_grey() +
geom_hline(yintercept=0, linetype="dashed", color = "firebrick")
plot_log_4
comb <- bind_rows(predict_5, predict_6)
comb$collgrad <- ""
comb[c(1:41), 7] <- "n.collgrad"
comb[c(42:82), 7] <- "collgrad"
plot_log_5 <- ggplot(comb, aes(y=fit, x = tenure_centr, colour=collgrad)) +
geom_line() +
geom_point(aes(shape = collgrad)) +
geom_errorbar(aes(ymin=lower, ymax=upper),width = 0.5)+
scale_x_continuous(breaks=seq(-20,20,10)) +
ylab("Pr(union=1)") +
xlab("Tenure (zentriert)") +
labs(title = "Geschätzte Wahrscheinlichkeiten mit 95%-KI") +
scale_colour_grey() +
geom_hline(yintercept=0, linetype="dashed", color = "firebrick") +
theme(plot.title = element_text(hjust = 0.5))
plot_log_5
## The mean for tenure
# Tenure in groups
nlsw88$tenure_gr <- as.numeric(cut(nlsw88$tenure, 8))
# Mean of union by group
tenure_gr <- as.data.frame(aggregate(x = nlsw88$union, by = list(tenure_gr=nlsw88$tenure_gr), FUN = mean, na.rm=TRUE))
nlsw88 <- merge(nlsw88,tenure_gr, all = TRUE)
nlsw88 <- rename(nlsw88, mean_union = x)
nlsw88$tenure_gr <- as.numeric(nlsw88$tenure_gr)
nlsw88$mean_union <- as.numeric(nlsw88$mean_union)
## Plots
plot(nlsw88$tenure, nlsw88$union , xaxt='n', ann=FALSE)
par(new=TRUE)
plot(nlsw88$tenure_gr, nlsw88$mean_union, type="b")
axis(side = 4)
# Smoothing over union membership and tenure
func_plot <- ggplot(nlsw88, aes(y=union, x=tenure)) +
geom_point(shape=1) +
geom_smooth(aes(x=tenure, y=union)) +
geom_line(aes(x=tenure, y=mean_union))
# Smoothing over mean(union membership) and tenure
func_plot_2 <- ggplot(nlsw88, aes(y=union, x=tenure)) +
geom_point(shape=1) +
geom_smooth(aes(x=tenure, y=mean_union)) +
geom_line(aes(x=tenure, y=mean_union))
invisible(library(ggpubr))
## Registered S3 methods overwritten by 'car':
## method from
## influence.merMod lme4
## cooks.distance.influence.merMod lme4
## dfbeta.influence.merMod lme4
## dfbetas.influence.merMod lme4
ggarrange(func_plot, func_plot_2,
labels = c("A" , "B"),
ncol = 2, nrow = 1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 378 rows containing non-finite values (stat_smooth).
## Warning: Removed 378 rows containing missing values (geom_point).
## Warning: Removed 15 row(s) containing missing values (geom_path).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 378 rows containing missing values (geom_point).
## Warning: Removed 15 row(s) containing missing values (geom_path).
## The effect
plot(nlsw88$tenure_centr, log_reg$coefficients[["(Intercept)"]]+log_reg$coefficients[["tenure_centr"]]*nlsw88$tenure_centr, type="b", xlab = "Tenure (zentriert)", ylab = "Effect(Log-Odds)")
estat classification
## Via predicted probabilities
nlsw88$yhat_indic <- ifelse(nlsw88$yhat<0.5, 0, 1)
klasifikation <- table(nlsw88$yhat_indic, nlsw88$union)
klasifikation
##
## 0 1
## 0 1401 456
## 1 7 4
prop.table(klasifikation, 1)
##
## 0 1
## 0 0.7544426 0.2455574
## 1 0.6363636 0.3636364
prop.table(klasifikation, 2)
##
## 0 1
## 0 0.995028409 0.991304348
## 1 0.004971591 0.008695652
## Second way
classDF <- data.frame(response = nlsw88$union, predicted = nlsw88$yhat_indic)
xtabs(~ predicted + response, data = classDF)
## response
## predicted 0 1
## 0 1401 456
## 1 7 4
## Let's mimic the Stata command!
# Table
class_1 <- as.matrix(klasifikation)
rownames(class_1) <- c("Predic. 0", "Predic. 1")
colnames(class_1) <- c("True 0", "True 1")
# Sensitivity (true positives)
class_2 <- (class_1[2,2]/(class_1[2,2]+class_1[1,2]))*100
names(class_2) <- "Sensitivity %"
# Specificity (true negatives)
class_3 <- (class_1[1,1]/(class_1[1,1]+class_1[2,1]))*100
names(class_3) <- "Specificity %"
# False Positives // Einfacher 100 - Sensitivity
class_4 <- (class_1[2,1]/(class_1[1,1]+class_1[2,1]))*100
names(class_4) <- "False Positives"
# False Negatives // Einfacher 100 - Specificity
class_5 <- (class_1[1,2]/(class_1[2,2]+class_1[1,2]))*100
names(class_5) <- "False Negatives"
# R²-Count - Correctly Classified
class_6 <- ((class_1[1,1]+class_1[2,2])/sum(class_1))*100
names(class_6) <- "R²-Count"
# Adjusted R²-Count - Correctly Classified
class_7 <- (((class_1[1,1]+class_1[2,2])-max(colSums(class_1)))/((sum(class_1))-max(colSums(class_1))))*100
names(class_7) <- "Adj. R²-Count"
estat_classification <- list(class_1,class_2,class_3, class_4, class_5, class_6, class_7)
estat_classification
## [[1]]
##
## True 0 True 1
## Predic. 0 1401 456
## Predic. 1 7 4
##
## [[2]]
## Sensitivity %
## 0.8695652
##
## [[3]]
## Specificity %
## 99.50284
##
## [[4]]
## False Positives
## 0.4971591
##
## [[5]]
## False Negatives
## 99.13043
##
## [[6]]
## R²-Count
## 75.21413
##
## [[7]]
## Adj. R²-Count
## -0.6521739
## Or with my own, self-written package. Please contact the e-mail above for the source file
library(estatclass)
estat_class(log_reg, nlsw88$union, 0.5)
## [[1]]
## dep.var
## predictions True 0 True 1
## Predic. 0 1401 456
## Predic. 1 7 4
##
## [[2]]
## Sensitivity or true positive rate (TPR) %
## 0.8695652
##
## [[3]]
## Specificity or true negative rate (TNR) %
## 99.50284
##
## [[4]]
## Miss rate or false negative rate (FNR) %
## 99.13043
##
## [[5]]
## Fall-out or false positive rate (FPR) %
## 0.4971591
##
## [[6]]
## Precision or positive predictive value (PPV) %
## 36.36364
##
## [[7]]
## False discovery rate (FDR) %
## 63.63636
##
## [[8]]
## R²-Count or accuracy (ACC) %
## 75.21413
##
## [[9]]
## Adj. R²-Count % (Long 1997: 108)
## -0.6521739
extat_class(log_reg, nlsw88$union, 0.5)
## [[1]]
## dep.var
## predictions True 0 True 1
## Predic. 0 1401 456
## Predic. 1 7 4
##
## [[2]]
## Sensitivity or true positive rate (TPR) %
## 0.8695652
##
## [[3]]
## Specificity or true negative rate (TNR) %
## 99.50284
##
## [[4]]
## miss rate or false negative rate (FNR) %
## 99.13043
##
## [[5]]
## fall-out or false positive rate (FPR) %
## 0.4971591
##
## [[6]]
## Precision or positive predictive value (PPV) %
## 36.36364
##
## [[7]]
## false discovery rate (FDR) %
## 63.63636
##
## [[8]]
## R²-Count or accuracy (ACC) %
## 75.21413
##
## [[9]]
## Adj. R²-Count % (Long 1997: 108)
## -0.6521739
##
## [[10]]
## F1 score
## 1.698514
##
## [[11]]
## balanced accuracy (BA) or balanced R²-Count %
## 50.1862
##
## [[12]]
## Matthews correlation coefficient (MCC)
## 0.02096982
##
## [[13]]
## Fowlkes–Mallows index (FM)
## 5.623216
##
## [[14]]
## informedness or bookmaker informedness (BM)
## 0.3724061
##
## [[15]]
## markedness (MK) or deltaP
## 11.8079
\(\chi^2_{\mathcal{L}(\text{diff})} = -2(\ln\mathcal{L}_{\text{reduced}} - \ln\mathcal{L}_{\text{full}})\)
full_model <- log_reg
reduced_model <-glm(union ~ tenure_centr + collgrad + as.factor(race), family = "binomial", data=nlsw88)
#### Without an extra package
chisqr_diff <- -2*(as.numeric(logLik(reduced_model))-as.numeric(logLik(log_reg)))
chisqr_diff
## [1] 0.07307636
# Small difference = hardly any improvement
## Computing the df for the difference of the models (we can take the difference in the number of coefficients = 1)
length(coef(full_model)) - length(coef(reduced_model))
## [1] 1
pchisq(chisqr_diff, df=1, lower.tail=FALSE) ## p-value = 0.7869092, insignificant
## [1] 0.7869092
#### With a package
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
lrtest (reduced_model, full_model)
## Likelihood ratio test
##
## Model 1: union ~ tenure_centr + collgrad + as.factor(race)
## Model 2: union ~ tenure_centr + age_centr + collgrad + as.factor(race)
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 5 -1012.4
## 2 6 -1012.4 1 0.0731 0.7869
The \(Pr>\chi^2\) value is far larger than 0.05. The null hypothesis (\(H_0\)), which here assumes that the effect of age in the population is 0, cannot be rejected. The probability of obtaining this log-likelihood for the model when the effect is set to zero is roughly 79%, which is a high value. This fact, as well as the rather small difference in the log-likelihoods of the two models (which ultimately produced that high \(Pr>\chi^2\) value), shows that adding age has hardly improved the model fit.
Formulas from Hosmer, Lemeshow, and Sturdivant: Applied Logistic Regression (2013, p. 191)
Pearson residuals: \(r_j\)
Standardized Pearson residuals: \(r_{sj}=\frac{r_j}{\sqrt{1-h_j}}\)
Delta-chi-squared: \(\Delta\chi_j^2=\frac{r_j^2}{1-h_j}=r_{sj}^2\)
### Without a package
# Pearson residuals (r)
pr <- residuals.glm(log_reg, "pearson")
# Leverage (H)
h <- hatvalues(log_reg)
# Predicted probabilities
yhat <- predict.glm(log_reg, type="response")
# Combining the data
residuals <- bind_cols(as.data.frame(yhat), as.data.frame(pr), as.data.frame(h))
# Aggregate to get a single value per duplicated yhat
# First, save the count (n) of each yhat value
n <- count(residuals, vars = c(yhat))
n <- rename(n, yhat = vars)
residuals <- aggregate(residuals, by=list(yhat), FUN=mean)
# Merging all the information
residuals <- merge(residuals, n, all = TRUE)
residuals <- residuals[,-2]
# Standardized Pearson residuals
residuals$spr <- residuals$pr/sqrt(1 - residuals$h)
# Delta-chi-squared
residuals$dChisq <- (residuals$pr)^2/(1 - residuals$h)
# We must multiply by the count (n) of the yhats because aggregate divided by n
residuals$dChisq <- residuals$dChisq*residuals$n
# Plotting - but how do we flag influential cases?
plot(residuals$yhat, residuals$dChisq)
id_residuals <- merge(residuals, nlsw88, all=TRUE)
## Note: the merge spreads the cases back out
plot(id_residuals$yhat, id_residuals$dChisq)
text(id_residuals$yhat, id_residuals$dChisq, labels=id_residuals$Row.names, cex= 0.7, pos=4)
## Nicer with ggplot - all cases with dChisq greater than 6
ggplot(id_residuals, aes(y=dChisq, x=yhat, label=Row.names)) +
geom_point() +
geom_text(aes(label=ifelse(dChisq>6, Row.names,'')), hjust=0, vjust=0)
## Warning: Removed 378 rows containing missing values (geom_point).
## Warning: Removed 378 rows containing missing values (geom_text).
# With ggrepel
library(ggrepel)
ggplot(id_residuals, aes(y=dChisq, x=yhat, label=Row.names)) +
geom_point() +
geom_label_repel(aes(label=ifelse(dChisq>6, Row.names,'')))
## Warning: Removed 378 rows containing missing values (geom_point).
## Warning: Removed 378 rows containing missing values (geom_label_repel).
#### With a package
library(LogisticDx)
dx <- dx(log_reg)
plot(dx$P, dx$dChisq)
# Interesting
library(boot)
glm.diag.plots(log_reg, glmdiag = glm.diag(log_reg))
…(industry). What do you conclude from this? Introduce a control variable with the categories primary production (Urproduktion), industrial sector, and service sector into the model and interpret the change in the estimated coefficients.
ggplot(id_residuals, aes(y=dChisq, x=yhat, label=Row.names, color=industry)) +
geom_point() +
geom_label_repel(aes(label=ifelse(dChisq>6, Row.names,'')))
## Warning: Removed 378 rows containing missing values (geom_point).
## Warning: Removed 378 rows containing missing values (geom_label_repel).
id_residuals$ind[id_residuals$industry==1] <- "Ag/Forestry/Fisheries"
id_residuals$ind[id_residuals$industry==2] <- "Mining"
id_residuals$ind[id_residuals$industry==3] <- "Construction"
id_residuals$ind[id_residuals$industry==4] <- "Manufacturing"
id_residuals$ind[id_residuals$industry==5] <- "Transport/Comm/Utility"
id_residuals$ind[id_residuals$industry==6] <- "Wholesale/Retail Trade"
id_residuals$ind[id_residuals$industry==7] <- "Finance/Ins/Real Estate"
id_residuals$ind[id_residuals$industry==8] <- "Business/Repair Svc"
id_residuals$ind[id_residuals$industry==9] <- "Personal Services"
id_residuals$ind[id_residuals$industry==10] <- "Entertainment/Rec Svc"
id_residuals$ind[id_residuals$industry==11] <- "Professional Services"
id_residuals$ind[id_residuals$industry==12] <- "Public Administration"
ggplot(id_residuals, aes(y=dChisq, x=yhat, label=Row.names, color=ind)) +
geom_point() +
geom_label_repel(aes(label=ifelse(dChisq>6, Row.names,'')))
## Warning: Removed 378 rows containing missing values (geom_point).
## Warning: Removed 378 rows containing missing values (geom_label_repel).
barplot <- dplyr::select(id_residuals, dChisq, industry)
barplot<- aggregate(x=list(dChisq=barplot$dChisq), by=list(industry=barplot$industry), FUN=mean, na.rm=TRUE)
barplot$ind[barplot$industry==1] <- "Ag/Forestry/Fisheries"
barplot$ind[barplot$industry==2] <- "Mining"
barplot$ind[barplot$industry==3] <- "Construction"
barplot$ind[barplot$industry==4] <- "Manufacturing"
barplot$ind[barplot$industry==5] <- "Transport/Comm/Utility"
barplot$ind[barplot$industry==6] <- "Wholesale/Retail Trade"
barplot$ind[barplot$industry==7] <- "Finance/Ins/Real Estate"
barplot$ind[barplot$industry==8] <- "Business/Repair Svc"
barplot$ind[barplot$industry==9] <- "Personal Services"
barplot$ind[barplot$industry==10] <- "Entertainment/Rec Svc"
barplot$ind[barplot$industry==11] <- "Professional Services"
barplot$ind[barplot$industry==12] <- "Public Administration"
barplot$indus <- as.character(barplot$industry)
ggplot(barplot, aes(y=dChisq, x=reorder(indus, dChisq), fill=ind)) +
geom_bar(stat = "identity")
nlsw88$ind_cut[between(nlsw88$industry,1,2)] <- "Urproduktion"
nlsw88$ind_cut[between(nlsw88$industry,3,6)] <- "Industrieller Sektor"
nlsw88$ind_cut[between(nlsw88$industry,7,12)] <- "Dienstleistungssektor"
log_reg <- glm(union ~ tenure_centr + age_centr + collgrad + as.factor(race) + as.factor(ind_cut), family = "binomial", data=nlsw88)
summary(log_reg)
##
## Call:
## glm(formula = union ~ tenure_centr + age_centr + collgrad + as.factor(race) +
## as.factor(ind_cut), family = "binomial", data = nlsw88)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3007 -0.7674 -0.6411 -0.5578 2.1502
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.405677 0.093280 -15.069 < 2e-16
## tenure_centr 0.047671 0.009399 5.072 3.94e-07
## age_centr 0.003630 0.018182 0.200 0.841737
## collgrad 0.477159 0.126119 3.783 0.000155
## as.factor(race)2 0.460152 0.120759 3.811 0.000139
## as.factor(race)3 0.587348 0.443692 1.324 0.185578
## as.factor(ind_cut)Industrieller Sektor -0.076480 0.119381 -0.641 0.521760
## as.factor(ind_cut)Urproduktion -0.587240 0.774654 -0.758 0.448411
##
## (Intercept) ***
## tenure_centr ***
## age_centr
## collgrad ***
## as.factor(race)2 ***
## as.factor(race)3
## as.factor(ind_cut)Industrieller Sektor
## as.factor(ind_cut)Urproduktion
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2071.3 on 1854 degrees of freedom
## Residual deviance: 2010.4 on 1847 degrees of freedom
## (391 observations deleted due to missingness)
## AIC: 2026.4
##
## Number of Fisher Scoring iterations: 4