::p_load(tidyverse) pacman
More R
Homework
<- c(0,3.7,0,3.7,3.9,3.8,3.8,3.8,3.8,3.7,3.8)
x stem(x)
The decimal point is at the |
0 | 00
1 |
2 |
3 | 777888889
Demo with more data sets
Fatal Police Shootings
load("fatal_police_shootings.rda")
ls()
[1] "fatal_police_shootings" "uniqdashc" "wideScreen"
[4] "x"
str(fatal_police_shootings)
'data.frame': 6421 obs. of 12 variables:
$ date : chr "2015-01-02" "2015-01-02" "2015-01-03" "2015-01-04" ...
$ manner_of_death : chr "shot" "shot" "shot and Tasered" "shot" ...
$ armed : chr "gun" "gun" "unarmed" "toy weapon" ...
$ age : int 53 47 23 32 39 18 22 35 34 47 ...
$ gender : chr "M" "M" "M" "M" ...
$ race : chr "A" "W" "H" "W" ...
$ city : chr "Shelton" "Aloha" "Wichita" "San Francisco" ...
$ state : chr "WA" "OR" "KS" "CA" ...
$ signs_of_mental_illness: chr "True" "False" "False" "True" ...
$ threat_level : chr "attack" "attack" "other" "attack" ...
$ flee : chr "Not fleeing" "Not fleeing" "Not fleeing" "Not fleeing" ...
$ body_camera : chr "False" "False" "False" "False" ...
<- fatal_police_shootings
df $date <- ymd(df$date)
dfstr(df)
'data.frame': 6421 obs. of 12 variables:
$ date : Date, format: "2015-01-02" "2015-01-02" ...
$ manner_of_death : chr "shot" "shot" "shot and Tasered" "shot" ...
$ armed : chr "gun" "gun" "unarmed" "toy weapon" ...
$ age : int 53 47 23 32 39 18 22 35 34 47 ...
$ gender : chr "M" "M" "M" "M" ...
$ race : chr "A" "W" "H" "W" ...
$ city : chr "Shelton" "Aloha" "Wichita" "San Francisco" ...
$ state : chr "WA" "OR" "KS" "CA" ...
$ signs_of_mental_illness: chr "True" "False" "False" "True" ...
$ threat_level : chr "attack" "attack" "other" "attack" ...
$ flee : chr "Not fleeing" "Not fleeing" "Not fleeing" "Not fleeing" ...
$ body_camera : chr "False" "False" "False" "False" ...
summary(df$date)
Min. 1st Qu. Median Mean 3rd Qu. Max.
"2015-01-02" "2016-08-16" "2018-03-27" "2018-04-02" "2019-11-29" "2021-07-04"
# stem(df$date)
stem(as.integer(df$date))
The decimal point is 2 digit(s) to the right of the |
164 | 44444444444444444444455555555555555555555555566666666666666666666666+73
165 | 00000000000000000000000000111111111111111111111111111111111112222222+187
166 | 00000000000000000000001111111111111111111112222222222222222222222222+206
167 | 00000000000000000000000000000001111111111111111111122222222222222222+194
168 | 00000000000000000000111111111111111111111112222222222222222222222222+199
169 | 00000000000000000000000011111111111111111111111112222222222222222222+181
170 | 00000000000000001111111111111111111111111111112222222222222222222222+172
171 | 00000000000000000000000111111111111111111111111111122222222222222222+185
172 | 00000000000000000000000000000000011111111111111111111111111111111111+186
173 | 00000000000000000000000000000111111111111111111111112222222222222222+198
174 | 00000000000000000000000000011111111111111111111111111222222222222222+186
175 | 00000000000000000001111111111111111111111222222222222222222222222222+202
176 | 00000000000000000000000000000000000000001111111111111111111111111111+228
177 | 00000000000000000000000000000011111111111111111111111112222222222222+169
178 | 00000000000000000000111111111111111111111111122222222222222222222222+155
179 | 00000000000000000000000000000000000001111111111111111111111111111111+197
180 | 00000000000000000000000000000011111111111111112222222222222222222222+185
181 | 00000000000000000001111111111111111111111111111111111222222222222222+185
182 | 00000000000000000000000000000111111111111111111111111111222222222222+213
183 | 00000000000000000000000111111111111111111111111122222222222222222222+207
184 | 00000000000000000000000000011111111111111111111111111111111111111111+194
185 | 00000000000000000000001111111111111111111111222222222222222222222233+192
186 | 00000000000000000000000000011111111111111111111111111111111222222222+188
187 | 00000000000000000000000000001111111111111111112222222222222222222222+169
188 | 00000000000000000000000000000000000111111111111111
str(df)
'data.frame': 6421 obs. of 12 variables:
$ date : Date, format: "2015-01-02" "2015-01-02" ...
$ manner_of_death : chr "shot" "shot" "shot and Tasered" "shot" ...
$ armed : chr "gun" "gun" "unarmed" "toy weapon" ...
$ age : int 53 47 23 32 39 18 22 35 34 47 ...
$ gender : chr "M" "M" "M" "M" ...
$ race : chr "A" "W" "H" "W" ...
$ city : chr "Shelton" "Aloha" "Wichita" "San Francisco" ...
$ state : chr "WA" "OR" "KS" "CA" ...
$ signs_of_mental_illness: chr "True" "False" "False" "True" ...
$ threat_level : chr "attack" "attack" "other" "attack" ...
$ flee : chr "Not fleeing" "Not fleeing" "Not fleeing" "Not fleeing" ...
$ body_camera : chr "False" "False" "False" "False" ...
$fleeing <- ifelse(df$flee=="Not fleeing",0,1)
df<- glm(fleeing ~ age,data=df,family="binomial")
m summary(m)
Call:
glm(formula = fleeing ~ age, family = "binomial", data = df)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.746484 0.083052 8.988 <2e-16 ***
age -0.032564 0.002205 -14.771 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 8217.5 on 6135 degrees of freedom
Residual deviance: 7981.4 on 6134 degrees of freedom
(285 observations deleted due to missingness)
AIC: 7985.4
Number of Fisher Scoring iterations: 4
table(df$race)
A B H N O W
666 105 1528 1066 90 47 2919
|> ggplot(aes(race,age)) +
df geom_boxplot() +
geom_jitter(alpha=0.1)
Warning: Removed 285 rows containing non-finite outside the scale range
(`stat_boxplot()`).
Warning: Removed 285 rows containing missing values or values outside the scale range
(`geom_point()`).
|> count(armed) |> arrange(desc(n)) df
armed n
1 gun 3699
2 knife 936
3 unarmed 413
4 toy weapon 219
5 208
6 vehicle 204
7 undetermined 173
8 unknown weapon 77
9 machete 49
10 Taser 31
11 ax 24
12 sword 23
13 gun and knife 21
14 baseball bat 20
15 hammer 18
16 gun and vehicle 16
17 metal pipe 16
18 screwdriver 16
19 hatchet 14
20 BB gun 13
21 box cutter 13
22 sharp object 13
23 gun and car 11
24 crossbow 9
25 scissors 9
26 piece of wood 7
27 pipe 7
28 rock 7
29 shovel 7
30 vehicle and gun 7
31 baton 6
32 meat cleaver 6
33 blunt object 5
34 crowbar 5
35 metal object 5
36 straight edge razor 5
37 chair 4
38 glass shard 4
39 metal pole 4
40 pick-axe 4
41 samurai sword 4
42 tire iron 4
43 Airsoft pistol 3
44 beer bottle 3
45 chain 3
46 gun and machete 3
47 guns and explosives 3
48 metal stick 3
49 pellet gun 3
50 pole 3
51 brick 2
52 chain saw 2
53 flashlight 2
54 garden tool 2
55 hatchet and gun 2
56 incendiary device 2
57 lawn mower blade 2
58 metal hand tool 2
59 pepper spray 2
60 pitchfork 2
61 pole and knife 2
62 spear 2
63 BB gun and vehicle 1
64 air conditioner 1
65 air pistol 1
66 barstool 1
67 baseball bat and bottle 1
68 baseball bat and fireplace poker 1
69 baseball bat and knife 1
70 bean-bag gun 1
71 binoculars 1
72 bottle 1
73 bow and arrow 1
74 car, knife and mace 1
75 carjack 1
76 chainsaw 1
77 claimed to be armed 1
78 contractor's level 1
79 cordless drill 1
80 fireworks 1
81 flagpole 1
82 grenade 1
83 gun and sword 1
84 hand torch 1
85 ice pick 1
86 knife and vehicle 1
87 machete and gun 1
88 metal rake 1
89 microphone 1
90 motorcycle 1
91 nail gun 1
92 oar 1
93 pen 1
94 railroad spikes 1
95 stapler 1
96 vehicle and machete 1
97 walking stick 1
98 wasp spray 1
99 wrench 1
Is there a difference between threat levels if a body camera is turned on?
<-table(df$threat_level,df$body_camera)
tblchisq.test(tbl)
Pearson's Chi-squared test
data: tbl
X-squared = 10.081, df = 2, p-value = 0.00647
Sleep in mammals
load("mammals.rda")
ls()
[1] "df" "fatal_police_shootings" "m"
[4] "mammals" "tbl" "uniqdashc"
[7] "wideScreen" "x"
str(mammals)
tibble [62 × 11] (S3: tbl_df/tbl/data.frame)
$ species : Factor w/ 62 levels "Africanelephant",..: 1 2 3 4 5 6 7 8 9 10 ...
$ body_wt : num [1:62] 6654 1 3.38 0.92 2547 ...
$ brain_wt : num [1:62] 5712 6.6 44.5 5.7 4603 ...
$ non_dreaming: num [1:62] NA 6.3 NA NA 2.1 9.1 15.8 5.2 10.9 8.3 ...
$ dreaming : num [1:62] NA 2 NA NA 1.8 0.7 3.9 1 3.6 1.4 ...
$ total_sleep : num [1:62] 3.3 8.3 12.5 16.5 3.9 9.8 19.7 6.2 14.5 9.7 ...
$ life_span : num [1:62] 38.6 4.5 14 NA 69 27 19 30.4 28 50 ...
$ gestation : num [1:62] 645 42 60 25 624 180 35 392 63 230 ...
$ predation : int [1:62] 3 3 1 5 3 4 1 4 1 1 ...
$ exposure : int [1:62] 5 1 1 2 5 4 1 5 2 1 ...
$ danger : int [1:62] 3 3 1 3 4 4 1 4 1 1 ...
<- mammals
df <- lm(total_sleep ~ brain_wt,data=df)
m summary(m)
Call:
lm(formula = total_sleep ~ brain_wt, data = df)
Residuals:
Min 1Q Median 3Q Max
-8.248 -2.610 -0.309 2.189 8.884
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 11.0163240 0.5941893 18.54 < 2e-16 ***
brain_wt -0.0017195 0.0005991 -2.87 0.00578 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.339 on 56 degrees of freedom
(4 observations deleted due to missingness)
Multiple R-squared: 0.1282, Adjusted R-squared: 0.1127
F-statistic: 8.238 on 1 and 56 DF, p-value: 0.00578
|> select(total_sleep,species) |> arrange(total_sleep) |> print(n=62) df
# A tibble: 62 × 2
total_sleep species
<dbl> <fct>
1 2.6 Roedeer
2 2.9 Horse
3 3.1 Donkey
4 3.3 Africanelephant
5 3.8 Goat
6 3.8 Sheep
7 3.9 Asianelephant
8 3.9 Cow
9 5.4 Rockhyrax(Procaviahab)
10 5.4 Treehyrax
11 6.1 Genet
12 6.2 Braziliantapir
13 6.2 Grayseal
14 6.6 Rockhyrax(Heterob)
15 8 Man
16 8.2 Guineapig
17 8.3 Africangiantpouchedrat
18 8.4 EasternAmericanmole
19 8.4 Pig
20 8.4 Rabbit
21 8.6 Echidna
22 9.1 Lessershort-tailedshrew
23 9.6 Rhesusmonkey
24 9.7 Chimpanzee
25 9.8 Baboon
26 9.8 Redfox
27 10.3 Deserthedgehog
28 10.3 Starnosedmole
29 10.3 Vervet
30 10.6 Molerat
31 10.7 Europeanhedgehog
32 10.7 Galago
33 10.8 Jaguar
34 10.9 Patasmonkey
35 11 Slowloris
36 11.2 Mountainbeaver
37 12 Gorilla
38 12.5 ArcticFox
39 12.5 Chinchilla
40 12.5 Raccoon
41 12.8 Muskshrew
42 13 Graywolf
43 13.2 Mouse
44 13.2 Rat
45 13.3 Tenrec
46 13.7 Phanlanger
47 13.8 Groundsquirrel
48 14.4 Goldenhamster
49 14.5 Cat
50 15.8 Treeshrew
51 16.5 Arcticgroundsquirrel
52 17 Owlmonkey
53 17.4 Nine-bandedarmadillo
54 18.1 Giantarmadillo
55 19.4 NAmericanopossum
56 19.4 Wateropossum
57 19.7 Bigbrownbat
58 19.9 Littlebrownbat
59 NA Giraffe
60 NA Kangaroo
61 NA Okapi
62 NA Yellow-belliedmarmot
::p_load(GGally)
pacman|> select(-species) |> ggpairs() df
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 14 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 12 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 14 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 12 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 14 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 14 rows containing non-finite outside the scale range
(`stat_density()`).
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 14 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 14 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 17 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 18 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 14 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 14 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 14 rows containing missing values
Warning: Removed 12 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 12 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 12 rows containing non-finite outside the scale range
(`stat_density()`).
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 14 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 15 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 16 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 12 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 12 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 12 rows containing missing values
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 14 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 4 rows containing non-finite outside the scale range
(`stat_density()`).
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 8 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 8 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 17 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 15 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 8 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 4 rows containing non-finite outside the scale range
(`stat_density()`).
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 7 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 18 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 16 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 8 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 4 rows containing non-finite outside the scale range
(`stat_density()`).
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 12 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 12 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 12 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
|> select(-species,-gestation,-exposure,-danger,-body_wt,-predation) |> ggpairs() df
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 14 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 12 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 14 rows containing non-finite outside the scale range
(`stat_density()`).
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 14 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 14 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 17 rows containing missing values
Warning: Removed 12 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 12 rows containing non-finite outside the scale range
(`stat_density()`).
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 14 rows containing missing values
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 15 rows containing missing values
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 14 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 4 rows containing non-finite outside the scale range
(`stat_density()`).
Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 8 rows containing missing values
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 17 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 15 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 8 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 4 rows containing non-finite outside the scale range
(`stat_density()`).
::p_load(corrgram)
pacman|> select(-species,-gestation,-exposure,-danger,-body_wt,-predation) |> corrgram(upper.panel=panel.pie) df
There are loads of other options for scatterplot matrices, known in the R Graph Gallery as correlograms.
::p_load(ellipse)
pacman::p_load(RColorBrewer)
pacman<- cor(df[,2:7])
data <- brewer.pal(5,"Spectral")
my_colors <- colorRampPalette(my_colors)(100)
my_colors <- order(data[1,])
ord <- data[ord,ord]
data_ord plotcorr(data_ord, col=my_colors[data_ord*50+50], mar=c(1,1,1,1) )
This didn’t work well, so my first guess is that I misspelled something. To check that, I’ll use the above code to reproduce the example in the R Graph Gallery.
<- cor(mtcars)
data <- brewer.pal(5,"Spectral")
my_colors <- colorRampPalette(my_colors)(100)
my_colors <- order(data[1,])
ord <- data[ord,ord]
data_ord plotcorr(data_ord, col=my_colors[data_ord*50+50], mar=c(1,1,1,1) )
This plots correctly, so there must be something else wrong. The next step is to find out what ord
and data_ord
are.
ord
[1] 6 2 3 4 11 7 10 9 8 5 1
data_ord
wt cyl disp hp carb qsec
wt 1.0000000 0.7824958 0.8879799 0.6587479 0.42760594 -0.17471588
cyl 0.7824958 1.0000000 0.9020329 0.8324475 0.52698829 -0.59124207
disp 0.8879799 0.9020329 1.0000000 0.7909486 0.39497686 -0.43369788
hp 0.6587479 0.8324475 0.7909486 1.0000000 0.74981247 -0.70822339
carb 0.4276059 0.5269883 0.3949769 0.7498125 1.00000000 -0.65624923
qsec -0.1747159 -0.5912421 -0.4336979 -0.7082234 -0.65624923 1.00000000
gear -0.5832870 -0.4926866 -0.5555692 -0.1257043 0.27407284 -0.21268223
am -0.6924953 -0.5226070 -0.5912270 -0.2432043 0.05753435 -0.22986086
vs -0.5549157 -0.8108118 -0.7104159 -0.7230967 -0.56960714 0.74453544
drat -0.7124406 -0.6999381 -0.7102139 -0.4487591 -0.09078980 0.09120476
mpg -0.8676594 -0.8521620 -0.8475514 -0.7761684 -0.55092507 0.41868403
gear am vs drat mpg
wt -0.5832870 -0.69249526 -0.5549157 -0.71244065 -0.8676594
cyl -0.4926866 -0.52260705 -0.8108118 -0.69993811 -0.8521620
disp -0.5555692 -0.59122704 -0.7104159 -0.71021393 -0.8475514
hp -0.1257043 -0.24320426 -0.7230967 -0.44875912 -0.7761684
carb 0.2740728 0.05753435 -0.5696071 -0.09078980 -0.5509251
qsec -0.2126822 -0.22986086 0.7445354 0.09120476 0.4186840
gear 1.0000000 0.79405876 0.2060233 0.69961013 0.4802848
am 0.7940588 1.00000000 0.1683451 0.71271113 0.5998324
vs 0.2060233 0.16834512 1.0000000 0.44027846 0.6640389
drat 0.6996101 0.71271113 0.4402785 1.00000000 0.6811719
mpg 0.4802848 0.59983243 0.6640389 0.68117191 1.0000000
Now I see the problem! Everything but but brain_wt
and body_wt
has NA values (three such values, I think).
# help(cor)
<- cor(df[,2:7],use="complete.obs")
data <- brewer.pal(5,"Spectral")
my_colors <- colorRampPalette(my_colors)(100)
my_colors <- order(data[1,])
ord <- data[ord,ord]
data_ord plotcorr(data_ord, col=my_colors[data_ord*50+50], mar=c(1,1,1,1) )
Employment Resumes
load("resume.rda")
ls()
[1] "data" "data_ord" "df"
[4] "fatal_police_shootings" "m" "mammals"
[7] "my_colors" "ord" "resume"
[10] "tbl" "uniqdashc" "wideScreen"
[13] "x"
str(resume)
tibble [4,870 × 30] (S3: tbl_df/tbl/data.frame)
$ job_ad_id : num [1:4870] 384 384 384 384 385 386 386 385 386 386 ...
$ job_city : chr [1:4870] "Chicago" "Chicago" "Chicago" "Chicago" ...
$ job_industry : chr [1:4870] "manufacturing" "manufacturing" "manufacturing" "manufacturing" ...
$ job_type : chr [1:4870] "supervisor" "supervisor" "supervisor" "supervisor" ...
$ job_fed_contractor : num [1:4870] NA NA NA NA 0 0 0 0 0 0 ...
$ job_equal_opp_employer: num [1:4870] 1 1 1 1 1 1 1 1 1 1 ...
$ job_ownership : chr [1:4870] "unknown" "unknown" "unknown" "unknown" ...
$ job_req_any : num [1:4870] 1 1 1 1 1 0 0 1 0 0 ...
$ job_req_communication : num [1:4870] 0 0 0 0 0 0 0 0 0 0 ...
$ job_req_education : num [1:4870] 0 0 0 0 0 0 0 0 0 0 ...
$ job_req_min_experience: chr [1:4870] "5" "5" "5" "5" ...
$ job_req_computer : num [1:4870] 1 1 1 1 1 0 0 1 0 0 ...
$ job_req_organization : num [1:4870] 0 0 0 0 1 0 0 1 0 0 ...
$ job_req_school : chr [1:4870] "none_listed" "none_listed" "none_listed" "none_listed" ...
$ received_callback : num [1:4870] 0 0 0 0 0 0 0 0 0 0 ...
$ firstname : chr [1:4870] "Allison" "Kristen" "Lakisha" "Latonya" ...
$ race : chr [1:4870] "white" "white" "black" "black" ...
$ gender : chr [1:4870] "f" "f" "f" "f" ...
$ years_college : int [1:4870] 4 3 4 3 3 4 4 3 4 4 ...
$ college_degree : num [1:4870] 1 0 1 0 0 1 1 0 1 1 ...
$ honors : int [1:4870] 0 0 0 0 0 1 0 0 0 0 ...
$ worked_during_school : int [1:4870] 0 1 1 0 1 0 1 0 0 1 ...
$ years_experience : int [1:4870] 6 6 6 6 22 6 5 21 3 6 ...
$ computer_skills : int [1:4870] 1 1 1 1 1 0 1 1 1 0 ...
$ special_skills : int [1:4870] 0 0 0 1 0 1 1 1 1 1 ...
$ volunteer : int [1:4870] 0 1 0 1 0 0 1 1 0 1 ...
$ military : int [1:4870] 0 1 0 0 0 0 0 0 0 0 ...
$ employment_holes : int [1:4870] 1 0 0 1 0 0 0 1 0 0 ...
$ has_email_address : int [1:4870] 0 1 0 1 1 0 1 1 0 1 ...
$ resume_quality : chr [1:4870] "low" "high" "low" "high" ...
<- resume
df <- glm(received_callback ~ .,data=df,family="binomial")
m summary(m)
Call:
glm(formula = received_callback ~ ., family = "binomial", data = df)
Coefficients: (3 not defined because of singularities)
Estimate Std. Error z value
(Intercept) -4.372e+00 1.086e+00 -4.027
job_ad_id 9.499e-04 2.730e-04 3.480
job_cityChicago -7.525e-01 2.107e-01 -3.571
job_industryfinance_insurance_real_estate -4.924e-01 3.085e-01 -1.596
job_industrymanufacturing -3.764e-01 2.766e-01 -1.361
job_industryother_service 2.720e-01 2.333e-01 1.166
job_industrytransportation_communication 1.790e-01 3.649e-01 0.490
job_industrywholesale_and_retail_trade -1.188e-01 2.093e-01 -0.568
job_typemanager 1.252e-01 3.308e-01 0.378
job_typeretail_sales 1.692e-01 3.174e-01 0.533
job_typesales_rep -1.520e-01 3.474e-01 -0.437
job_typesecretary -1.089e-01 2.410e-01 -0.452
job_typesupervisor 7.871e-02 3.423e-01 0.230
job_fed_contractor 2.088e-01 2.356e-01 0.886
job_equal_opp_employer 4.899e-01 1.655e-01 2.961
job_ownershipprivate 8.911e-01 3.338e-01 2.669
job_ownershippublic 7.876e-01 3.840e-01 2.051
job_ownershipunknown 6.311e-01 3.762e-01 1.678
job_req_any -1.968e-01 2.542e-01 -0.774
job_req_communication 4.130e-01 2.282e-01 1.810
job_req_education -2.205e-01 3.628e-01 -0.608
job_req_min_experience0 -1.384e+01 1.189e+03 -0.012
job_req_min_experience0.5 9.802e-01 1.191e+00 0.823
job_req_min_experience1 -4.522e-02 4.103e-01 -0.110
job_req_min_experience10 1.916e-01 1.087e+00 0.176
job_req_min_experience2 1.477e-01 2.854e-01 0.517
job_req_min_experience3 -5.928e-01 3.561e-01 -1.665
job_req_min_experience4 3.798e-01 1.122e+00 0.338
job_req_min_experience5 -4.773e-01 4.712e-01 -1.013
job_req_min_experience6 -1.420e+01 1.144e+03 -0.012
job_req_min_experience7 -1.366e+01 6.685e+02 -0.020
job_req_min_experience8 -1.428e+01 8.151e+02 -0.018
job_req_min_experiencesome -4.597e-02 2.186e-01 -0.210
job_req_computer 1.815e-01 2.164e-01 0.839
job_req_organization -1.046e+00 4.596e-01 -2.275
job_req_schoolhigh_school_grad -1.346e+01 4.006e+02 -0.034
job_req_schoolnone_listed NA NA NA
job_req_schoolsome_college 2.346e-01 4.538e-01 0.517
firstnameAllison 1.458e+00 6.522e-01 2.235
firstnameAnne 5.766e-01 7.168e-01 0.804
firstnameBrad 1.779e+00 7.597e-01 2.341
firstnameBrendan 9.361e-01 8.542e-01 1.096
firstnameBrett 1.042e+00 8.557e-01 1.218
firstnameCarrie 1.769e+00 6.589e-01 2.685
firstnameDarnell -1.221e-01 1.201e+00 -0.102
firstnameEbony 1.134e+00 6.729e-01 1.686
firstnameEmily 5.345e-01 7.152e-01 0.747
firstnameGeoffrey 4.044e-01 9.533e-01 0.424
firstnameGreg 1.239e+00 8.688e-01 1.426
firstnameHakim 1.146e+00 8.632e-01 1.328
firstnameJamal 4.176e-01 9.491e-01 0.440
firstnameJay 1.122e+00 8.108e-01 1.383
firstnameJermaine 1.522e-01 1.201e+00 0.127
firstnameJill 6.428e-01 7.049e-01 0.912
firstnameKareem 6.807e-01 9.497e-01 0.717
firstnameKeisha 2.477e-01 8.420e-01 0.294
firstnameKenya 1.211e+00 6.729e-01 1.799
firstnameKristen 1.733e+00 6.508e-01 2.662
firstnameLakisha 1.476e-02 7.863e-01 0.019
firstnameLatonya 1.007e+00 6.761e-01 1.490
firstnameLatoya 1.421e+00 6.600e-01 2.154
firstnameLaurie 1.109e+00 6.791e-01 1.634
firstnameLeroy 1.231e+00 8.603e-01 1.431
firstnameMatthew 1.231e+00 8.074e-01 1.524
firstnameMeredith 1.580e+00 6.620e-01 2.387
firstnameNeil 1.134e+00 8.089e-01 1.402
firstnameRasheed 5.574e-01 9.511e-01 0.586
firstnameSarah 1.366e+00 6.808e-01 2.006
firstnameTamika 1.516e-01 7.327e-01 0.207
firstnameTanisha 8.136e-01 7.052e-01 1.154
firstnameTodd 8.946e-01 8.577e-01 1.043
firstnameTremayne 1.028e+00 8.604e-01 1.195
firstnameTyrone 6.151e-01 8.552e-01 0.719
racewhite NA NA NA
genderm NA NA NA
years_college -1.976e-01 2.111e-01 -0.936
college_degree 7.236e-01 3.634e-01 1.991
honors 5.941e-01 2.515e-01 2.362
worked_during_school 1.318e-01 1.835e-01 0.718
years_experience 2.392e-02 1.510e-02 1.584
computer_skills -1.998e-01 1.988e-01 -1.005
special_skills 5.937e-01 1.684e-01 3.524
volunteer -7.074e-01 2.334e-01 -3.030
military 8.800e-03 2.943e-01 0.030
employment_holes 3.166e-01 1.835e-01 1.725
has_email_address 2.952e-02 3.301e-01 0.089
resume_qualitylow -6.183e-01 3.841e-01 -1.610
Pr(>|z|)
(Intercept) 5.64e-05 ***
job_ad_id 0.000501 ***
job_cityChicago 0.000356 ***
job_industryfinance_insurance_real_estate 0.110484
job_industrymanufacturing 0.173554
job_industryother_service 0.243538
job_industrytransportation_communication 0.623806
job_industrywholesale_and_retail_trade 0.570186
job_typemanager 0.705135
job_typeretail_sales 0.593870
job_typesales_rep 0.661794
job_typesecretary 0.651533
job_typesupervisor 0.818154
job_fed_contractor 0.375382
job_equal_opp_employer 0.003071 **
job_ownershipprivate 0.007604 **
job_ownershippublic 0.040243 *
job_ownershipunknown 0.093435 .
job_req_any 0.438902
job_req_communication 0.070254 .
job_req_education 0.543371
job_req_min_experience0 0.990712
job_req_min_experience0.5 0.410362
job_req_min_experience1 0.912249
job_req_min_experience10 0.860077
job_req_min_experience2 0.604901
job_req_min_experience3 0.095977 .
job_req_min_experience4 0.734989
job_req_min_experience5 0.311109
job_req_min_experience6 0.990096
job_req_min_experience7 0.983696
job_req_min_experience8 0.986023
job_req_min_experiencesome 0.833459
job_req_computer 0.401664
job_req_organization 0.022919 *
job_req_schoolhigh_school_grad 0.973196
job_req_schoolnone_listed NA
job_req_schoolsome_college 0.605183
firstnameAllison 0.025403 *
firstnameAnne 0.421132
firstnameBrad 0.019230 *
firstnameBrendan 0.273127
firstnameBrett 0.223260
firstnameCarrie 0.007263 **
firstnameDarnell 0.919041
firstnameEbony 0.091886 .
firstnameEmily 0.454821
firstnameGeoffrey 0.671455
firstnameGreg 0.153753
firstnameHakim 0.184148
firstnameJamal 0.659911
firstnameJay 0.166518
firstnameJermaine 0.899160
firstnameJill 0.361803
firstnameKareem 0.473528
firstnameKeisha 0.768648
firstnameKenya 0.071973 .
firstnameKristen 0.007762 **
firstnameLakisha 0.985026
firstnameLatonya 0.136226
firstnameLatoya 0.031272 *
firstnameLaurie 0.102348
firstnameLeroy 0.152423
firstnameMatthew 0.127441
firstnameMeredith 0.016999 *
firstnameNeil 0.160781
firstnameRasheed 0.557814
firstnameSarah 0.044888 *
firstnameTamika 0.836099
firstnameTanisha 0.248610
firstnameTodd 0.296926
firstnameTremayne 0.232055
firstnameTyrone 0.471964
racewhite NA
genderm NA
years_college 0.349289
college_degree 0.046439 *
honors 0.018168 *
worked_during_school 0.472695
years_experience 0.113115
computer_skills 0.314907
special_skills 0.000424 ***
volunteer 0.002443 **
military 0.976143
employment_holes 0.084553 .
has_email_address 0.928739
resume_qualitylow 0.107454
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1674.4 on 3101 degrees of freedom
Residual deviance: 1487.3 on 3018 degrees of freedom
(1768 observations deleted due to missingness)
AIC: 1655.3
Number of Fisher Scoring iterations: 15
<- glm(received_callback ~ race,data=df,family="binomial")
m summary(m)
Call:
glm(formula = received_callback ~ race, family = "binomial",
data = df)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.67481 0.08251 -32.417 < 2e-16 ***
racewhite 0.43818 0.10732 4.083 4.45e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2726.9 on 4869 degrees of freedom
Residual deviance: 2709.9 on 4868 degrees of freedom
AIC: 2713.9
Number of Fisher Scoring iterations: 5
<- glm(received_callback ~ gender,data=df,family="binomial")
m summary(m)
Call:
glm(formula = received_callback ~ gender, family = "binomial",
data = df)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.40901 0.05939 -40.562 <2e-16 ***
genderm -0.12008 0.12859 -0.934 0.35
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2726.9 on 4869 degrees of freedom
Residual deviance: 2726.0 on 4868 degrees of freedom
AIC: 2730
Number of Fisher Scoring iterations: 5
<- table(df$gender,df$race)) (tbl
black white
f 1886 1860
m 549 575
chisq.test(tbl)
Pearson's Chi-squared test with Yates' continuity correction
data: tbl
X-squared = 0.72289, df = 1, p-value = 0.3952
<- table(df$received_callback,df$race)) (tbl
black white
0 2278 2200
1 157 235
chisq.test(tbl)
Pearson's Chi-squared test with Yates' continuity correction
data: tbl
X-squared = 16.449, df = 1, p-value = 4.998e-05
Sleep Deprivation
load("sleep_deprivation.rda")
ls()
[1] "data" "data_ord" "df"
[4] "fatal_police_shootings" "m" "mammals"
[7] "my_colors" "ord" "resume"
[10] "sleep_deprivation" "tbl" "uniqdashc"
[13] "wideScreen" "x"
str(sleep_deprivation)
tibble [1,087 × 2] (S3: tbl_df/tbl/data.frame)
$ sleep : Factor w/ 3 levels "<6",">8","6-8": 1 1 1 1 1 1 1 1 1 1 ...
$ profession: Factor w/ 5 levels "bus / taxi / limo drivers",..: 2 2 2 2 2 2 2 2 2 2 ...
<- sleep_deprivation
df table(df$sleep,df$profession)
bus / taxi / limo drivers control pilots train operators truck drivers
<6 21 35 19 29 35
>8 58 64 51 32 51
6-8 131 193 132 119 117
<- glm(profession ~ sleep,data=df,family="binomial")
m summary(m)
Call:
glm(formula = profession ~ sleep, family = "binomial", data = df)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.7262 0.2368 7.288 3.14e-13 ***
sleep>8 -0.4983 0.2800 -1.780 0.0751 .
sleep6-8 -0.2716 0.2559 -1.061 0.2886
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1067.0 on 1086 degrees of freedom
Residual deviance: 1063.5 on 1084 degrees of freedom
AIC: 1069.5
Number of Fisher Scoring iterations: 4
<- glm(sleep ~ profession,data=df,family="binomial")
m summary(m)
Call:
glm(formula = sleep ~ profession, family = "binomial", data = df)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.19722 0.23000 9.553 <2e-16 ***
professioncontrol -0.20350 0.29217 -0.697 0.4861
professionpilots 0.06782 0.33314 0.204 0.8387
professiontrain operators -0.54724 0.30660 -1.785 0.0743 .
professiontruck drivers -0.62861 0.29568 -2.126 0.0335 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 831.18 on 1086 degrees of freedom
Residual deviance: 822.22 on 1082 degrees of freedom
AIC: 832.22
Number of Fisher Scoring iterations: 4
$sleep <- ordered(df$sleep, levels = c("<6","6-8",">8"))
df<- glm(sleep ~ profession,data=df,family="binomial")
m summary(m)
Call:
glm(formula = sleep ~ profession, family = "binomial", data = df)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.19722 0.23000 9.553 <2e-16 ***
professioncontrol -0.20350 0.29217 -0.697 0.4861
professionpilots 0.06782 0.33314 0.204 0.8387
professiontrain operators -0.54724 0.30660 -1.785 0.0743 .
professiontruck drivers -0.62861 0.29568 -2.126 0.0335 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 831.18 on 1086 degrees of freedom
Residual deviance: 822.22 on 1082 degrees of freedom
AIC: 832.22
Number of Fisher Scoring iterations: 4
<- glm(profession ~ sleep,data=df,family="binomial")
m summary(m)
Call:
glm(formula = profession ~ sleep, family = "binomial", data = df)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.46950 0.09877 14.878 <2e-16 ***
sleep.L -0.35238 0.19797 -1.780 0.0751 .
sleep.Q 0.01835 0.13907 0.132 0.8950
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1067.0 on 1086 degrees of freedom
Residual deviance: 1063.5 on 1084 degrees of freedom
AIC: 1069.5
Number of Fisher Scoring iterations: 4
Oops! It is only by coincidence that we get an output for each of these models. The glm()
function is not applicable to multinomial models unless we do a lot of math on it. Even then, it’s not universally applicable. Much better is to actually construct a multinomial logistic regression model. We did that last time with the multinom()
function, but the output was hard to interpret. Let’s use the tidy()
function to make it easier to read.
::p_load(nnet)
pacman::p_load(broom)
pacman::p_load(kableExtra)
pacman<- multinom(sleep ~ profession,data=df) m
# weights: 18 (10 variable)
initial value 1194.191558
iter 10 value 964.457953
final value 961.288627
converged
tidy(m, conf.int = TRUE) |> kable() |> kable_styling("basic",full_width=FALSE)
y.level | term | estimate | std.error | statistic | p.value | conf.low | conf.high |
---|---|---|---|---|---|---|---|
6-8 | (Intercept) | 1.8306335 | 0.2350556 | 7.7880880 | 0.0000000 | 1.3699331 | 2.2913340 |
6-8 | professioncontrol | -0.1233365 | 0.2983337 | -0.4134181 | 0.6793003 | -0.7080598 | 0.4613867 |
6-8 | professionpilots | 0.1076454 | 0.3397862 | 0.3168033 | 0.7513929 | -0.5583234 | 0.7736142 |
6-8 | professiontrain operators | -0.4188257 | 0.3132678 | -1.3369574 | 0.1812366 | -1.0328192 | 0.1951679 |
6-8 | professiontruck drivers | -0.6238287 | 0.3039230 | -2.0525879 | 0.0401126 | -1.2195069 | -0.0281505 |
>8 | (Intercept) | 1.0158943 | 0.2546738 | 3.9890023 | 0.0000664 | 0.5167429 | 1.5150458 |
>8 | professioncontrol | -0.4123800 | 0.3302332 | -1.2487539 | 0.2117551 | -1.0596252 | 0.2348652 |
>8 | professionpilots | -0.0285802 | 0.3702625 | -0.0771890 | 0.9384732 | -0.7542813 | 0.6971209 |
>8 | professiontrain operators | -0.9174745 | 0.3613738 | -2.5388517 | 0.0111217 | -1.6257542 | -0.2091949 |
>8 | professiontruck drivers | -0.6394257 | 0.3362105 | -1.9018613 | 0.0571893 | -1.2983862 | 0.0195347 |
The kable
stuff is because, by default, the tibble output format of tidy()
only shows the first ten letters of a chr
column, which is not enough to distinguish between them, given that they are all prefixed with the word “profession.” The kableExtra
package provides the tools to construct a better-looking html table.
$sleep <- ordered(df$sleep, levels = c(">8","6-8","<6"))
df<- multinom(sleep ~ profession,data=df) m
# weights: 18 (10 variable)
initial value 1194.191558
iter 10 value 963.528061
final value 961.288627
converged
tidy(m, conf.int = TRUE) |> kable() |> kable_styling("basic",full_width=FALSE)
y.level | term | estimate | std.error | statistic | p.value | conf.low | conf.high |
---|---|---|---|---|---|---|---|
6-8 | (Intercept) | 0.8147439 | 0.1577176 | 5.1658408 | 0.0000002 | 0.5056231 | 1.1238646 |
6-8 | professioncontrol | 0.2890608 | 0.2137315 | 1.3524486 | 0.1762318 | -0.1298451 | 0.7079668 |
6-8 | professionpilots | 0.1362254 | 0.2281630 | 0.5970531 | 0.5504719 | -0.3109658 | 0.5834166 |
6-8 | professiontrain operators | 0.4986388 | 0.2540238 | 1.9629612 | 0.0496507 | 0.0007614 | 0.9965162 |
6-8 | professiontruck drivers | 0.0156043 | 0.2302817 | 0.0677617 | 0.9459753 | -0.4357396 | 0.4669481 |
<6 | (Intercept) | -1.0159415 | 0.2546780 | -3.9891222 | 0.0000663 | -1.5151012 | -0.5167819 |
<6 | professioncontrol | 0.4123970 | 0.3302385 | 1.2487853 | 0.2117436 | -0.2348586 | 1.0596525 |
<6 | professionpilots | 0.0285458 | 0.3702707 | 0.0770943 | 0.9385485 | -0.6971714 | 0.7542630 |
<6 | professiontrain operators | 0.9174965 | 0.3613773 | 2.5388883 | 0.0111205 | 0.2092101 | 1.6257829 |
<6 | professiontruck drivers | 0.6394607 | 0.3362145 | 1.9019426 | 0.0571787 | -0.0195076 | 1.2984290 |
Murders (not recommended for analysis)
load("murders.rda")
ls()
[1] "data" "data_ord" "df"
[4] "fatal_police_shootings" "m" "mammals"
[7] "murders" "my_colors" "ord"
[10] "resume" "sleep_deprivation" "tbl"
[13] "uniqdashc" "wideScreen" "x"
str(murders)
tibble [20 × 4] (S3: tbl_df/tbl/data.frame)
$ population : int [1:20] 587000 643000 635000 692000 1248000 643000 1964000 1531000 713000 749000 ...
$ perc_pov : num [1:20] 16.5 20.5 26.3 16.5 19.2 16.5 20.2 21.3 17.2 14.3 ...
$ perc_unemp : num [1:20] 6.2 6.4 9.3 5.3 7.3 5.9 6.4 7.6 4.9 6.4 ...
$ annual_murders_per_mil: num [1:20] 11.2 13.4 40.7 5.3 24.8 12.7 20.9 35.7 8.7 9.6 ...
<- murders
df <- lm(annual_murders_per_mil ~ .,data=df)
m summary(m)
Call:
lm(formula = annual_murders_per_mil ~ ., data = df)
Residuals:
Min 1Q Median 3Q Max
-5.7174 -3.3233 0.4031 1.7684 10.0329
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.676e+01 7.011e+00 -5.244 8.03e-05 ***
population 7.629e-07 6.363e-07 1.199 0.24798
perc_pov 1.192e+00 5.617e-01 2.123 0.04974 *
perc_unemp 4.720e+00 1.530e+00 3.084 0.00712 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.59 on 16 degrees of freedom
Multiple R-squared: 0.8183, Adjusted R-squared: 0.7843
F-statistic: 24.02 on 3 and 16 DF, p-value: 3.629e-06
When people are unemployed, their thoughts turn to murder. Wait, wut?
NBA players 2018–2019
load("nba_players_19.rda")
ls()
[1] "data" "data_ord" "df"
[4] "fatal_police_shootings" "m" "mammals"
[7] "murders" "my_colors" "nba_players_19"
[10] "ord" "resume" "sleep_deprivation"
[13] "tbl" "uniqdashc" "wideScreen"
[16] "x"
str(nba_players_19)
tibble [494 × 7] (S3: tbl_df/tbl/data.frame)
$ first_name: chr [1:494] "Alex" "Jaylen" "Steven" "Bam" ...
$ last_name : chr [1:494] "Abrines" "Adams" "Adams" "Adebayo" ...
$ team : chr [1:494] "Thunder" "Hawks" "Thunder" "Heat" ...
$ team_abbr : chr [1:494] "OKC" "ATL" "OKC" "MIA" ...
$ position : chr [1:494] "Guard" "Guard" "Center" "Center-Forward" ...
$ number : chr [1:494] "8" "10" "12" "13" ...
$ height : num [1:494] 78 74 84 82 78 83 77 77 83 81 ...
<- nba_players_19
df |> ggplot(aes(position,height)) +
df geom_boxplot() +
geom_jitter(alpha=0.2)
Poker (anonymous)
load("poker.rda")
ls()
[1] "data" "data_ord" "df"
[4] "fatal_police_shootings" "m" "mammals"
[7] "murders" "my_colors" "nba_players_19"
[10] "ord" "poker" "resume"
[13] "sleep_deprivation" "tbl" "uniqdashc"
[16] "wideScreen" "x"
str(poker)
tibble [50 × 1] (S3: tbl_df/tbl/data.frame)
$ winnings: int [1:50] -110 -9 -60 316 -200 -196 320 -160 31 331 ...
<- poker
df stem(df$winnings)
The decimal point is 3 digit(s) to the right of the |
-1 | 00
-0 | 9875
-0 | 333222221111100000
0 | 000001111112233333444
0 | 889
1 |
1 | 7
2 |
2 |
3 |
3 | 7
|> ggplot(aes(winnings)) +
df geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
SAT and GPA data
load("satgpa.rda")
ls()
[1] "data" "data_ord" "df"
[4] "fatal_police_shootings" "m" "mammals"
[7] "murders" "my_colors" "nba_players_19"
[10] "ord" "poker" "resume"
[13] "satgpa" "sleep_deprivation" "tbl"
[16] "uniqdashc" "wideScreen" "x"
str(satgpa)
tibble [1,000 × 6] (S3: tbl_df/tbl/data.frame)
$ sex : int [1:1000] 1 2 2 1 1 2 1 1 2 1 ...
$ sat_v : int [1:1000] 65 58 56 42 55 55 57 53 67 41 ...
$ sat_m : int [1:1000] 62 64 60 53 52 56 65 62 77 44 ...
$ sat_sum: int [1:1000] 127 122 116 95 107 111 122 115 144 85 ...
$ hs_gpa : num [1:1000] 3.4 4 3.75 3.75 4 4 2.8 3.8 4 2.6 ...
$ fy_gpa : num [1:1000] 3.18 3.33 3.25 2.42 2.63 2.91 2.83 2.51 3.82 2.54 ...
<- satgpa
df # m <- glm(sex ~ .,data=df,family="binomial")
table(df$sex)
1 2
516 484
$sex <- ifelse(df$sex == 2,0,1)
df<- glm(sex ~ .,data=df,family="binomial")
m summary(m)
Call:
glm(formula = sex ~ ., family = "binomial", data = df)
Coefficients: (1 not defined because of singularities)
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.600177 0.553805 -2.889 0.003859 **
sat_v -0.004676 0.009844 -0.475 0.634824
sat_m 0.107299 0.010710 10.019 < 2e-16 ***
sat_sum NA NA NA NA
hs_gpa -0.920906 0.161527 -5.701 1.19e-08 ***
fy_gpa -0.400117 0.117156 -3.415 0.000637 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1385.3 on 999 degrees of freedom
Residual deviance: 1233.7 on 995 degrees of freedom
AIC: 1243.7
Number of Fisher Scoring iterations: 4
Who knows which sex is which? Anyway.
<- lm(sat_m ~ hs_gpa,data=df)
m summary(m)
Call:
lm(formula = sat_m ~ hs_gpa, data = df)
Residuals:
Min 1Q Median 3Q Max
-25.0738 -5.2266 0.2412 5.0743 22.7225
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 35.5320 1.4829 23.96 <2e-16 ***
hs_gpa 5.8982 0.4572 12.90 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 7.827 on 998 degrees of freedom
Multiple R-squared: 0.1429, Adjusted R-squared: 0.1421
F-statistic: 166.4 on 1 and 998 DF, p-value: < 2.2e-16
<- lm(fy_gpa ~ sat_m,data=df)
m summary(m)
Call:
lm(formula = fy_gpa ~ sat_m, data = df)
Residuals:
Min 1Q Median 3Q Max
-2.28486 -0.44666 0.00499 0.49423 1.68271
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.621899 0.140849 4.415 1.12e-05 ***
sat_m 0.033938 0.002559 13.264 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.6834 on 998 degrees of freedom
Multiple R-squared: 0.1499, Adjusted R-squared: 0.149
F-statistic: 175.9 on 1 and 998 DF, p-value: < 2.2e-16