More R

pacman::p_load(tidyverse)

Homework

x <- c(0,3.7,0,3.7,3.9,3.8,3.8,3.8,3.8,3.7,3.8)
stem(x)


  The decimal point is at the |

  0 | 00
  1 | 
  2 | 
  3 | 777888889

Demo with more data sets

Fatal Police Shootings

load("fatal_police_shootings.rda")
ls()

[1] "fatal_police_shootings" "uniqdashc"              "wideScreen"            
[4] "x"

str(fatal_police_shootings)

'data.frame':   6421 obs. of  12 variables:
 $ date                   : chr  "2015-01-02" "2015-01-02" "2015-01-03" "2015-01-04" ...
 $ manner_of_death        : chr  "shot" "shot" "shot and Tasered" "shot" ...
 $ armed                  : chr  "gun" "gun" "unarmed" "toy weapon" ...
 $ age                    : int  53 47 23 32 39 18 22 35 34 47 ...
 $ gender                 : chr  "M" "M" "M" "M" ...
 $ race                   : chr  "A" "W" "H" "W" ...
 $ city                   : chr  "Shelton" "Aloha" "Wichita" "San Francisco" ...
 $ state                  : chr  "WA" "OR" "KS" "CA" ...
 $ signs_of_mental_illness: chr  "True" "False" "False" "True" ...
 $ threat_level           : chr  "attack" "attack" "other" "attack" ...
 $ flee                   : chr  "Not fleeing" "Not fleeing" "Not fleeing" "Not fleeing" ...
 $ body_camera            : chr  "False" "False" "False" "False" ...

df <- fatal_police_shootings
df$date <- ymd(df$date)
str(df)

'data.frame':   6421 obs. of  12 variables:
 $ date                   : Date, format: "2015-01-02" "2015-01-02" ...
 $ manner_of_death        : chr  "shot" "shot" "shot and Tasered" "shot" ...
 $ armed                  : chr  "gun" "gun" "unarmed" "toy weapon" ...
 $ age                    : int  53 47 23 32 39 18 22 35 34 47 ...
 $ gender                 : chr  "M" "M" "M" "M" ...
 $ race                   : chr  "A" "W" "H" "W" ...
 $ city                   : chr  "Shelton" "Aloha" "Wichita" "San Francisco" ...
 $ state                  : chr  "WA" "OR" "KS" "CA" ...
 $ signs_of_mental_illness: chr  "True" "False" "False" "True" ...
 $ threat_level           : chr  "attack" "attack" "other" "attack" ...
 $ flee                   : chr  "Not fleeing" "Not fleeing" "Not fleeing" "Not fleeing" ...
 $ body_camera            : chr  "False" "False" "False" "False" ...

summary(df$date)

        Min.      1st Qu.       Median         Mean      3rd Qu.         Max. 
"2015-01-02" "2016-08-16" "2018-03-27" "2018-04-02" "2019-11-29" "2021-07-04"

# stem(df$date)
stem(as.integer(df$date))


  The decimal point is 2 digit(s) to the right of the |

  164 | 44444444444444444444455555555555555555555555566666666666666666666666+73
  165 | 00000000000000000000000000111111111111111111111111111111111112222222+187
  166 | 00000000000000000000001111111111111111111112222222222222222222222222+206
  167 | 00000000000000000000000000000001111111111111111111122222222222222222+194
  168 | 00000000000000000000111111111111111111111112222222222222222222222222+199
  169 | 00000000000000000000000011111111111111111111111112222222222222222222+181
  170 | 00000000000000001111111111111111111111111111112222222222222222222222+172
  171 | 00000000000000000000000111111111111111111111111111122222222222222222+185
  172 | 00000000000000000000000000000000011111111111111111111111111111111111+186
  173 | 00000000000000000000000000000111111111111111111111112222222222222222+198
  174 | 00000000000000000000000000011111111111111111111111111222222222222222+186
  175 | 00000000000000000001111111111111111111111222222222222222222222222222+202
  176 | 00000000000000000000000000000000000000001111111111111111111111111111+228
  177 | 00000000000000000000000000000011111111111111111111111112222222222222+169
  178 | 00000000000000000000111111111111111111111111122222222222222222222222+155
  179 | 00000000000000000000000000000000000001111111111111111111111111111111+197
  180 | 00000000000000000000000000000011111111111111112222222222222222222222+185
  181 | 00000000000000000001111111111111111111111111111111111222222222222222+185
  182 | 00000000000000000000000000000111111111111111111111111111222222222222+213
  183 | 00000000000000000000000111111111111111111111111122222222222222222222+207
  184 | 00000000000000000000000000011111111111111111111111111111111111111111+194
  185 | 00000000000000000000001111111111111111111111222222222222222222222233+192
  186 | 00000000000000000000000000011111111111111111111111111111111222222222+188
  187 | 00000000000000000000000000001111111111111111112222222222222222222222+169
  188 | 00000000000000000000000000000000000111111111111111

str(df)

'data.frame':   6421 obs. of  12 variables:
 $ date                   : Date, format: "2015-01-02" "2015-01-02" ...
 $ manner_of_death        : chr  "shot" "shot" "shot and Tasered" "shot" ...
 $ armed                  : chr  "gun" "gun" "unarmed" "toy weapon" ...
 $ age                    : int  53 47 23 32 39 18 22 35 34 47 ...
 $ gender                 : chr  "M" "M" "M" "M" ...
 $ race                   : chr  "A" "W" "H" "W" ...
 $ city                   : chr  "Shelton" "Aloha" "Wichita" "San Francisco" ...
 $ state                  : chr  "WA" "OR" "KS" "CA" ...
 $ signs_of_mental_illness: chr  "True" "False" "False" "True" ...
 $ threat_level           : chr  "attack" "attack" "other" "attack" ...
 $ flee                   : chr  "Not fleeing" "Not fleeing" "Not fleeing" "Not fleeing" ...
 $ body_camera            : chr  "False" "False" "False" "False" ...

df$fleeing <- ifelse(df$flee=="Not fleeing",0,1)
m <- glm(fleeing ~ age,data=df,family="binomial")
summary(m)


Call:
glm(formula = fleeing ~ age, family = "binomial", data = df)

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept)  0.746484   0.083052   8.988   <2e-16 ***
age         -0.032564   0.002205 -14.771   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 8217.5  on 6135  degrees of freedom
Residual deviance: 7981.4  on 6134  degrees of freedom
  (285 observations deleted due to missingness)
AIC: 7985.4

Number of Fisher Scoring iterations: 4

table(df$race)


        A    B    H    N    O    W 
 666  105 1528 1066   90   47 2919

df |> ggplot(aes(race,age)) +
  geom_boxplot() +
  geom_jitter(alpha=0.1)

Warning: Removed 285 rows containing non-finite outside the scale range
(`stat_boxplot()`).

Warning: Removed 285 rows containing missing values or values outside the scale range
(`geom_point()`).

df |> count(armed) |> arrange(desc(n))

                              armed    n
1                               gun 3699
2                             knife  936
3                           unarmed  413
4                        toy weapon  219
5                                    208
6                           vehicle  204
7                      undetermined  173
8                    unknown weapon   77
9                           machete   49
10                            Taser   31
11                               ax   24
12                            sword   23
13                    gun and knife   21
14                     baseball bat   20
15                           hammer   18
16                  gun and vehicle   16
17                       metal pipe   16
18                      screwdriver   16
19                          hatchet   14
20                           BB gun   13
21                       box cutter   13
22                     sharp object   13
23                      gun and car   11
24                         crossbow    9
25                         scissors    9
26                    piece of wood    7
27                             pipe    7
28                             rock    7
29                           shovel    7
30                  vehicle and gun    7
31                            baton    6
32                     meat cleaver    6
33                     blunt object    5
34                          crowbar    5
35                     metal object    5
36              straight edge razor    5
37                            chair    4
38                      glass shard    4
39                       metal pole    4
40                         pick-axe    4
41                    samurai sword    4
42                        tire iron    4
43                   Airsoft pistol    3
44                      beer bottle    3
45                            chain    3
46                  gun and machete    3
47              guns and explosives    3
48                      metal stick    3
49                       pellet gun    3
50                             pole    3
51                            brick    2
52                        chain saw    2
53                       flashlight    2
54                      garden tool    2
55                  hatchet and gun    2
56                incendiary device    2
57                 lawn mower blade    2
58                  metal hand tool    2
59                     pepper spray    2
60                        pitchfork    2
61                   pole and knife    2
62                            spear    2
63               BB gun and vehicle    1
64                  air conditioner    1
65                       air pistol    1
66                         barstool    1
67          baseball bat and bottle    1
68 baseball bat and fireplace poker    1
69           baseball bat and knife    1
70                     bean-bag gun    1
71                       binoculars    1
72                           bottle    1
73                    bow and arrow    1
74              car, knife and mace    1
75                          carjack    1
76                         chainsaw    1
77              claimed to be armed    1
78               contractor's level    1
79                   cordless drill    1
80                        fireworks    1
81                         flagpole    1
82                          grenade    1
83                    gun and sword    1
84                       hand torch    1
85                         ice pick    1
86                knife and vehicle    1
87                  machete and gun    1
88                       metal rake    1
89                       microphone    1
90                       motorcycle    1
91                         nail gun    1
92                              oar    1
93                              pen    1
94                  railroad spikes    1
95                          stapler    1
96              vehicle and machete    1
97                    walking stick    1
98                       wasp spray    1
99                           wrench    1

Is there a difference between threat levels if a body camera is turned on?

tbl<-table(df$threat_level,df$body_camera)
chisq.test(tbl)


    Pearson's Chi-squared test

data:  tbl
X-squared = 10.081, df = 2, p-value = 0.00647

Sleep in mammals

load("mammals.rda")
ls()

[1] "df"                     "fatal_police_shootings" "m"                     
[4] "mammals"                "tbl"                    "uniqdashc"             
[7] "wideScreen"             "x"

str(mammals)

tibble [62 × 11] (S3: tbl_df/tbl/data.frame)
 $ species     : Factor w/ 62 levels "Africanelephant",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ body_wt     : num [1:62] 6654 1 3.38 0.92 2547 ...
 $ brain_wt    : num [1:62] 5712 6.6 44.5 5.7 4603 ...
 $ non_dreaming: num [1:62] NA 6.3 NA NA 2.1 9.1 15.8 5.2 10.9 8.3 ...
 $ dreaming    : num [1:62] NA 2 NA NA 1.8 0.7 3.9 1 3.6 1.4 ...
 $ total_sleep : num [1:62] 3.3 8.3 12.5 16.5 3.9 9.8 19.7 6.2 14.5 9.7 ...
 $ life_span   : num [1:62] 38.6 4.5 14 NA 69 27 19 30.4 28 50 ...
 $ gestation   : num [1:62] 645 42 60 25 624 180 35 392 63 230 ...
 $ predation   : int [1:62] 3 3 1 5 3 4 1 4 1 1 ...
 $ exposure    : int [1:62] 5 1 1 2 5 4 1 5 2 1 ...
 $ danger      : int [1:62] 3 3 1 3 4 4 1 4 1 1 ...

df <- mammals
m <- lm(total_sleep ~ brain_wt,data=df)
summary(m)


Call:
lm(formula = total_sleep ~ brain_wt, data = df)

Residuals:
   Min     1Q Median     3Q    Max 
-8.248 -2.610 -0.309  2.189  8.884 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 11.0163240  0.5941893   18.54  < 2e-16 ***
brain_wt    -0.0017195  0.0005991   -2.87  0.00578 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.339 on 56 degrees of freedom
  (4 observations deleted due to missingness)
Multiple R-squared:  0.1282,    Adjusted R-squared:  0.1127 
F-statistic: 8.238 on 1 and 56 DF,  p-value: 0.00578

df |> select(total_sleep,species) |> arrange(total_sleep) |> print(n=62)

# A tibble: 62 × 2
   total_sleep species                
         <dbl> <fct>                  
 1         2.6 Roedeer                
 2         2.9 Horse                  
 3         3.1 Donkey                 
 4         3.3 Africanelephant        
 5         3.8 Goat                   
 6         3.8 Sheep                  
 7         3.9 Asianelephant          
 8         3.9 Cow                    
 9         5.4 Rockhyrax(Procaviahab) 
10         5.4 Treehyrax              
11         6.1 Genet                  
12         6.2 Braziliantapir         
13         6.2 Grayseal               
14         6.6 Rockhyrax(Heterob)     
15         8   Man                    
16         8.2 Guineapig              
17         8.3 Africangiantpouchedrat 
18         8.4 EasternAmericanmole    
19         8.4 Pig                    
20         8.4 Rabbit                 
21         8.6 Echidna                
22         9.1 Lessershort-tailedshrew
23         9.6 Rhesusmonkey           
24         9.7 Chimpanzee             
25         9.8 Baboon                 
26         9.8 Redfox                 
27        10.3 Deserthedgehog         
28        10.3 Starnosedmole          
29        10.3 Vervet                 
30        10.6 Molerat                
31        10.7 Europeanhedgehog       
32        10.7 Galago                 
33        10.8 Jaguar                 
34        10.9 Patasmonkey            
35        11   Slowloris              
36        11.2 Mountainbeaver         
37        12   Gorilla                
38        12.5 ArcticFox              
39        12.5 Chinchilla             
40        12.5 Raccoon                
41        12.8 Muskshrew              
42        13   Graywolf               
43        13.2 Mouse                  
44        13.2 Rat                    
45        13.3 Tenrec                 
46        13.7 Phanlanger             
47        13.8 Groundsquirrel         
48        14.4 Goldenhamster          
49        14.5 Cat                    
50        15.8 Treeshrew              
51        16.5 Arcticgroundsquirrel   
52        17   Owlmonkey              
53        17.4 Nine-bandedarmadillo   
54        18.1 Giantarmadillo         
55        19.4 NAmericanopossum       
56        19.4 Wateropossum           
57        19.7 Bigbrownbat            
58        19.9 Littlebrownbat         
59        NA   Giraffe                
60        NA   Kangaroo               
61        NA   Okapi                  
62        NA   Yellow-belliedmarmot

pacman::p_load(GGally)
df |> select(-species) |> ggpairs()

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 14 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 12 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 14 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 12 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values

Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 14 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 14 rows containing non-finite outside the scale range
(`stat_density()`).

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 14 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 14 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 17 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 18 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 14 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 14 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 14 rows containing missing values

Warning: Removed 12 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 12 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 12 rows containing non-finite outside the scale range
(`stat_density()`).

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 14 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 15 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 16 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 12 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 12 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 12 rows containing missing values

Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 14 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 4 rows containing non-finite outside the scale range
(`stat_density()`).

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 8 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 8 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values

Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 17 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 15 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 8 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 4 rows containing non-finite outside the scale range
(`stat_density()`).

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 7 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values

Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 18 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 16 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 8 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 4 rows containing non-finite outside the scale range
(`stat_density()`).

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values

Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 12 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 12 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 12 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).

df |> select(-species,-gestation,-exposure,-danger,-body_wt,-predation) |> ggpairs()

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 14 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 12 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 4 rows containing missing values

Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 14 rows containing non-finite outside the scale range
(`stat_density()`).

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 14 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 14 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 17 rows containing missing values

Warning: Removed 12 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 12 rows containing non-finite outside the scale range
(`stat_density()`).

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 14 rows containing missing values

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 15 rows containing missing values

Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 14 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 4 rows containing non-finite outside the scale range
(`stat_density()`).

Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
Removed 8 rows containing missing values

Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 17 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 15 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 8 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 4 rows containing non-finite outside the scale range
(`stat_density()`).

pacman::p_load(corrgram)
df |> select(-species,-gestation,-exposure,-danger,-body_wt,-predation) |> corrgram(upper.panel=panel.pie)

There are loads of other options for scatterplot matrices, known in the R Graph Gallery as correlograms.

pacman::p_load(ellipse)
pacman::p_load(RColorBrewer)
data <- cor(df[,2:7])
my_colors <- brewer.pal(5,"Spectral")
my_colors <- colorRampPalette(my_colors)(100)
ord <- order(data[1,])
data_ord <- data[ord,ord]
plotcorr(data_ord, col=my_colors[data_ord*50+50], mar=c(1,1,1,1) )

This didn’t work well, so my first guess is that I misspelled something. To check that, I’ll use the above code to reproduce the example in the R Graph Gallery.

data <- cor(mtcars)
my_colors <- brewer.pal(5,"Spectral")
my_colors <- colorRampPalette(my_colors)(100)
ord <- order(data[1,])
data_ord <- data[ord,ord]
plotcorr(data_ord, col=my_colors[data_ord*50+50], mar=c(1,1,1,1) )

This plots correctly, so there must be something else wrong. The next step is to find out what ord and data_ord are.

ord

 [1]  6  2  3  4 11  7 10  9  8  5  1

data_ord

             wt        cyl       disp         hp        carb        qsec
wt    1.0000000  0.7824958  0.8879799  0.6587479  0.42760594 -0.17471588
cyl   0.7824958  1.0000000  0.9020329  0.8324475  0.52698829 -0.59124207
disp  0.8879799  0.9020329  1.0000000  0.7909486  0.39497686 -0.43369788
hp    0.6587479  0.8324475  0.7909486  1.0000000  0.74981247 -0.70822339
carb  0.4276059  0.5269883  0.3949769  0.7498125  1.00000000 -0.65624923
qsec -0.1747159 -0.5912421 -0.4336979 -0.7082234 -0.65624923  1.00000000
gear -0.5832870 -0.4926866 -0.5555692 -0.1257043  0.27407284 -0.21268223
am   -0.6924953 -0.5226070 -0.5912270 -0.2432043  0.05753435 -0.22986086
vs   -0.5549157 -0.8108118 -0.7104159 -0.7230967 -0.56960714  0.74453544
drat -0.7124406 -0.6999381 -0.7102139 -0.4487591 -0.09078980  0.09120476
mpg  -0.8676594 -0.8521620 -0.8475514 -0.7761684 -0.55092507  0.41868403
           gear          am         vs        drat        mpg
wt   -0.5832870 -0.69249526 -0.5549157 -0.71244065 -0.8676594
cyl  -0.4926866 -0.52260705 -0.8108118 -0.69993811 -0.8521620
disp -0.5555692 -0.59122704 -0.7104159 -0.71021393 -0.8475514
hp   -0.1257043 -0.24320426 -0.7230967 -0.44875912 -0.7761684
carb  0.2740728  0.05753435 -0.5696071 -0.09078980 -0.5509251
qsec -0.2126822 -0.22986086  0.7445354  0.09120476  0.4186840
gear  1.0000000  0.79405876  0.2060233  0.69961013  0.4802848
am    0.7940588  1.00000000  0.1683451  0.71271113  0.5998324
vs    0.2060233  0.16834512  1.0000000  0.44027846  0.6640389
drat  0.6996101  0.71271113  0.4402785  1.00000000  0.6811719
mpg   0.4802848  0.59983243  0.6640389  0.68117191  1.0000000

Now I see the problem! Everything but but brain_wt and body_wt has NA values (three such values, I think).

# help(cor)
data <- cor(df[,2:7],use="complete.obs")
my_colors <- brewer.pal(5,"Spectral")
my_colors <- colorRampPalette(my_colors)(100)
ord <- order(data[1,])
data_ord <- data[ord,ord]
plotcorr(data_ord, col=my_colors[data_ord*50+50], mar=c(1,1,1,1) )

Employment Resumes

load("resume.rda")
ls()

 [1] "data"                   "data_ord"               "df"                    
 [4] "fatal_police_shootings" "m"                      "mammals"               
 [7] "my_colors"              "ord"                    "resume"                
[10] "tbl"                    "uniqdashc"              "wideScreen"            
[13] "x"

str(resume)

tibble [4,870 × 30] (S3: tbl_df/tbl/data.frame)
 $ job_ad_id             : num [1:4870] 384 384 384 384 385 386 386 385 386 386 ...
 $ job_city              : chr [1:4870] "Chicago" "Chicago" "Chicago" "Chicago" ...
 $ job_industry          : chr [1:4870] "manufacturing" "manufacturing" "manufacturing" "manufacturing" ...
 $ job_type              : chr [1:4870] "supervisor" "supervisor" "supervisor" "supervisor" ...
 $ job_fed_contractor    : num [1:4870] NA NA NA NA 0 0 0 0 0 0 ...
 $ job_equal_opp_employer: num [1:4870] 1 1 1 1 1 1 1 1 1 1 ...
 $ job_ownership         : chr [1:4870] "unknown" "unknown" "unknown" "unknown" ...
 $ job_req_any           : num [1:4870] 1 1 1 1 1 0 0 1 0 0 ...
 $ job_req_communication : num [1:4870] 0 0 0 0 0 0 0 0 0 0 ...
 $ job_req_education     : num [1:4870] 0 0 0 0 0 0 0 0 0 0 ...
 $ job_req_min_experience: chr [1:4870] "5" "5" "5" "5" ...
 $ job_req_computer      : num [1:4870] 1 1 1 1 1 0 0 1 0 0 ...
 $ job_req_organization  : num [1:4870] 0 0 0 0 1 0 0 1 0 0 ...
 $ job_req_school        : chr [1:4870] "none_listed" "none_listed" "none_listed" "none_listed" ...
 $ received_callback     : num [1:4870] 0 0 0 0 0 0 0 0 0 0 ...
 $ firstname             : chr [1:4870] "Allison" "Kristen" "Lakisha" "Latonya" ...
 $ race                  : chr [1:4870] "white" "white" "black" "black" ...
 $ gender                : chr [1:4870] "f" "f" "f" "f" ...
 $ years_college         : int [1:4870] 4 3 4 3 3 4 4 3 4 4 ...
 $ college_degree        : num [1:4870] 1 0 1 0 0 1 1 0 1 1 ...
 $ honors                : int [1:4870] 0 0 0 0 0 1 0 0 0 0 ...
 $ worked_during_school  : int [1:4870] 0 1 1 0 1 0 1 0 0 1 ...
 $ years_experience      : int [1:4870] 6 6 6 6 22 6 5 21 3 6 ...
 $ computer_skills       : int [1:4870] 1 1 1 1 1 0 1 1 1 0 ...
 $ special_skills        : int [1:4870] 0 0 0 1 0 1 1 1 1 1 ...
 $ volunteer             : int [1:4870] 0 1 0 1 0 0 1 1 0 1 ...
 $ military              : int [1:4870] 0 1 0 0 0 0 0 0 0 0 ...
 $ employment_holes      : int [1:4870] 1 0 0 1 0 0 0 1 0 0 ...
 $ has_email_address     : int [1:4870] 0 1 0 1 1 0 1 1 0 1 ...
 $ resume_quality        : chr [1:4870] "low" "high" "low" "high" ...

df <- resume
m <- glm(received_callback ~ .,data=df,family="binomial")
summary(m)


Call:
glm(formula = received_callback ~ ., family = "binomial", data = df)

Coefficients: (3 not defined because of singularities)
                                            Estimate Std. Error z value
(Intercept)                               -4.372e+00  1.086e+00  -4.027
job_ad_id                                  9.499e-04  2.730e-04   3.480
job_cityChicago                           -7.525e-01  2.107e-01  -3.571
job_industryfinance_insurance_real_estate -4.924e-01  3.085e-01  -1.596
job_industrymanufacturing                 -3.764e-01  2.766e-01  -1.361
job_industryother_service                  2.720e-01  2.333e-01   1.166
job_industrytransportation_communication   1.790e-01  3.649e-01   0.490
job_industrywholesale_and_retail_trade    -1.188e-01  2.093e-01  -0.568
job_typemanager                            1.252e-01  3.308e-01   0.378
job_typeretail_sales                       1.692e-01  3.174e-01   0.533
job_typesales_rep                         -1.520e-01  3.474e-01  -0.437
job_typesecretary                         -1.089e-01  2.410e-01  -0.452
job_typesupervisor                         7.871e-02  3.423e-01   0.230
job_fed_contractor                         2.088e-01  2.356e-01   0.886
job_equal_opp_employer                     4.899e-01  1.655e-01   2.961
job_ownershipprivate                       8.911e-01  3.338e-01   2.669
job_ownershippublic                        7.876e-01  3.840e-01   2.051
job_ownershipunknown                       6.311e-01  3.762e-01   1.678
job_req_any                               -1.968e-01  2.542e-01  -0.774
job_req_communication                      4.130e-01  2.282e-01   1.810
job_req_education                         -2.205e-01  3.628e-01  -0.608
job_req_min_experience0                   -1.384e+01  1.189e+03  -0.012
job_req_min_experience0.5                  9.802e-01  1.191e+00   0.823
job_req_min_experience1                   -4.522e-02  4.103e-01  -0.110
job_req_min_experience10                   1.916e-01  1.087e+00   0.176
job_req_min_experience2                    1.477e-01  2.854e-01   0.517
job_req_min_experience3                   -5.928e-01  3.561e-01  -1.665
job_req_min_experience4                    3.798e-01  1.122e+00   0.338
job_req_min_experience5                   -4.773e-01  4.712e-01  -1.013
job_req_min_experience6                   -1.420e+01  1.144e+03  -0.012
job_req_min_experience7                   -1.366e+01  6.685e+02  -0.020
job_req_min_experience8                   -1.428e+01  8.151e+02  -0.018
job_req_min_experiencesome                -4.597e-02  2.186e-01  -0.210
job_req_computer                           1.815e-01  2.164e-01   0.839
job_req_organization                      -1.046e+00  4.596e-01  -2.275
job_req_schoolhigh_school_grad            -1.346e+01  4.006e+02  -0.034
job_req_schoolnone_listed                         NA         NA      NA
job_req_schoolsome_college                 2.346e-01  4.538e-01   0.517
firstnameAllison                           1.458e+00  6.522e-01   2.235
firstnameAnne                              5.766e-01  7.168e-01   0.804
firstnameBrad                              1.779e+00  7.597e-01   2.341
firstnameBrendan                           9.361e-01  8.542e-01   1.096
firstnameBrett                             1.042e+00  8.557e-01   1.218
firstnameCarrie                            1.769e+00  6.589e-01   2.685
firstnameDarnell                          -1.221e-01  1.201e+00  -0.102
firstnameEbony                             1.134e+00  6.729e-01   1.686
firstnameEmily                             5.345e-01  7.152e-01   0.747
firstnameGeoffrey                          4.044e-01  9.533e-01   0.424
firstnameGreg                              1.239e+00  8.688e-01   1.426
firstnameHakim                             1.146e+00  8.632e-01   1.328
firstnameJamal                             4.176e-01  9.491e-01   0.440
firstnameJay                               1.122e+00  8.108e-01   1.383
firstnameJermaine                          1.522e-01  1.201e+00   0.127
firstnameJill                              6.428e-01  7.049e-01   0.912
firstnameKareem                            6.807e-01  9.497e-01   0.717
firstnameKeisha                            2.477e-01  8.420e-01   0.294
firstnameKenya                             1.211e+00  6.729e-01   1.799
firstnameKristen                           1.733e+00  6.508e-01   2.662
firstnameLakisha                           1.476e-02  7.863e-01   0.019
firstnameLatonya                           1.007e+00  6.761e-01   1.490
firstnameLatoya                            1.421e+00  6.600e-01   2.154
firstnameLaurie                            1.109e+00  6.791e-01   1.634
firstnameLeroy                             1.231e+00  8.603e-01   1.431
firstnameMatthew                           1.231e+00  8.074e-01   1.524
firstnameMeredith                          1.580e+00  6.620e-01   2.387
firstnameNeil                              1.134e+00  8.089e-01   1.402
firstnameRasheed                           5.574e-01  9.511e-01   0.586
firstnameSarah                             1.366e+00  6.808e-01   2.006
firstnameTamika                            1.516e-01  7.327e-01   0.207
firstnameTanisha                           8.136e-01  7.052e-01   1.154
firstnameTodd                              8.946e-01  8.577e-01   1.043
firstnameTremayne                          1.028e+00  8.604e-01   1.195
firstnameTyrone                            6.151e-01  8.552e-01   0.719
racewhite                                         NA         NA      NA
genderm                                           NA         NA      NA
years_college                             -1.976e-01  2.111e-01  -0.936
college_degree                             7.236e-01  3.634e-01   1.991
honors                                     5.941e-01  2.515e-01   2.362
worked_during_school                       1.318e-01  1.835e-01   0.718
years_experience                           2.392e-02  1.510e-02   1.584
computer_skills                           -1.998e-01  1.988e-01  -1.005
special_skills                             5.937e-01  1.684e-01   3.524
volunteer                                 -7.074e-01  2.334e-01  -3.030
military                                   8.800e-03  2.943e-01   0.030
employment_holes                           3.166e-01  1.835e-01   1.725
has_email_address                          2.952e-02  3.301e-01   0.089
resume_qualitylow                         -6.183e-01  3.841e-01  -1.610
                                          Pr(>|z|)    
(Intercept)                               5.64e-05 ***
job_ad_id                                 0.000501 ***
job_cityChicago                           0.000356 ***
job_industryfinance_insurance_real_estate 0.110484    
job_industrymanufacturing                 0.173554    
job_industryother_service                 0.243538    
job_industrytransportation_communication  0.623806    
job_industrywholesale_and_retail_trade    0.570186    
job_typemanager                           0.705135    
job_typeretail_sales                      0.593870    
job_typesales_rep                         0.661794    
job_typesecretary                         0.651533    
job_typesupervisor                        0.818154    
job_fed_contractor                        0.375382    
job_equal_opp_employer                    0.003071 ** 
job_ownershipprivate                      0.007604 ** 
job_ownershippublic                       0.040243 *  
job_ownershipunknown                      0.093435 .  
job_req_any                               0.438902    
job_req_communication                     0.070254 .  
job_req_education                         0.543371    
job_req_min_experience0                   0.990712    
job_req_min_experience0.5                 0.410362    
job_req_min_experience1                   0.912249    
job_req_min_experience10                  0.860077    
job_req_min_experience2                   0.604901    
job_req_min_experience3                   0.095977 .  
job_req_min_experience4                   0.734989    
job_req_min_experience5                   0.311109    
job_req_min_experience6                   0.990096    
job_req_min_experience7                   0.983696    
job_req_min_experience8                   0.986023    
job_req_min_experiencesome                0.833459    
job_req_computer                          0.401664    
job_req_organization                      0.022919 *  
job_req_schoolhigh_school_grad            0.973196    
job_req_schoolnone_listed                       NA    
job_req_schoolsome_college                0.605183    
firstnameAllison                          0.025403 *  
firstnameAnne                             0.421132    
firstnameBrad                             0.019230 *  
firstnameBrendan                          0.273127    
firstnameBrett                            0.223260    
firstnameCarrie                           0.007263 ** 
firstnameDarnell                          0.919041    
firstnameEbony                            0.091886 .  
firstnameEmily                            0.454821    
firstnameGeoffrey                         0.671455    
firstnameGreg                             0.153753    
firstnameHakim                            0.184148    
firstnameJamal                            0.659911    
firstnameJay                              0.166518    
firstnameJermaine                         0.899160    
firstnameJill                             0.361803    
firstnameKareem                           0.473528    
firstnameKeisha                           0.768648    
firstnameKenya                            0.071973 .  
firstnameKristen                          0.007762 ** 
firstnameLakisha                          0.985026    
firstnameLatonya                          0.136226    
firstnameLatoya                           0.031272 *  
firstnameLaurie                           0.102348    
firstnameLeroy                            0.152423    
firstnameMatthew                          0.127441    
firstnameMeredith                         0.016999 *  
firstnameNeil                             0.160781    
firstnameRasheed                          0.557814    
firstnameSarah                            0.044888 *  
firstnameTamika                           0.836099    
firstnameTanisha                          0.248610    
firstnameTodd                             0.296926    
firstnameTremayne                         0.232055    
firstnameTyrone                           0.471964    
racewhite                                       NA    
genderm                                         NA    
years_college                             0.349289    
college_degree                            0.046439 *  
honors                                    0.018168 *  
worked_during_school                      0.472695    
years_experience                          0.113115    
computer_skills                           0.314907    
special_skills                            0.000424 ***
volunteer                                 0.002443 ** 
military                                  0.976143    
employment_holes                          0.084553 .  
has_email_address                         0.928739    
resume_qualitylow                         0.107454    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1674.4  on 3101  degrees of freedom
Residual deviance: 1487.3  on 3018  degrees of freedom
  (1768 observations deleted due to missingness)
AIC: 1655.3

Number of Fisher Scoring iterations: 15

m <- glm(received_callback ~ race,data=df,family="binomial")
summary(m)


Call:
glm(formula = received_callback ~ race, family = "binomial", 
    data = df)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -2.67481    0.08251 -32.417  < 2e-16 ***
racewhite    0.43818    0.10732   4.083 4.45e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2726.9  on 4869  degrees of freedom
Residual deviance: 2709.9  on 4868  degrees of freedom
AIC: 2713.9

Number of Fisher Scoring iterations: 5

m <- glm(received_callback ~ gender,data=df,family="binomial")
summary(m)


Call:
glm(formula = received_callback ~ gender, family = "binomial", 
    data = df)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -2.40901    0.05939 -40.562   <2e-16 ***
genderm     -0.12008    0.12859  -0.934     0.35    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2726.9  on 4869  degrees of freedom
Residual deviance: 2726.0  on 4868  degrees of freedom
AIC: 2730

Number of Fisher Scoring iterations: 5

(tbl <- table(df$gender,df$race))

   
    black white
  f  1886  1860
  m   549   575

chisq.test(tbl)


    Pearson's Chi-squared test with Yates' continuity correction

data:  tbl
X-squared = 0.72289, df = 1, p-value = 0.3952

(tbl <- table(df$received_callback,df$race))

   
    black white
  0  2278  2200
  1   157   235

chisq.test(tbl)


    Pearson's Chi-squared test with Yates' continuity correction

data:  tbl
X-squared = 16.449, df = 1, p-value = 4.998e-05

Sleep Deprivation

load("sleep_deprivation.rda")
ls()

 [1] "data"                   "data_ord"               "df"                    
 [4] "fatal_police_shootings" "m"                      "mammals"               
 [7] "my_colors"              "ord"                    "resume"                
[10] "sleep_deprivation"      "tbl"                    "uniqdashc"             
[13] "wideScreen"             "x"

str(sleep_deprivation)

tibble [1,087 × 2] (S3: tbl_df/tbl/data.frame)
 $ sleep     : Factor w/ 3 levels "<6",">8","6-8": 1 1 1 1 1 1 1 1 1 1 ...
 $ profession: Factor w/ 5 levels "bus / taxi / limo drivers",..: 2 2 2 2 2 2 2 2 2 2 ...

df <- sleep_deprivation
table(df$sleep,df$profession)

     
      bus / taxi / limo drivers control pilots train operators truck drivers
  <6                         21      35     19              29            35
  >8                         58      64     51              32            51
  6-8                       131     193    132             119           117

m <- glm(profession ~ sleep,data=df,family="binomial")
summary(m)


Call:
glm(formula = profession ~ sleep, family = "binomial", data = df)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)   1.7262     0.2368   7.288 3.14e-13 ***
sleep>8      -0.4983     0.2800  -1.780   0.0751 .  
sleep6-8     -0.2716     0.2559  -1.061   0.2886    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1067.0  on 1086  degrees of freedom
Residual deviance: 1063.5  on 1084  degrees of freedom
AIC: 1069.5

Number of Fisher Scoring iterations: 4

m <- glm(sleep ~ profession,data=df,family="binomial")
summary(m)


Call:
glm(formula = sleep ~ profession, family = "binomial", data = df)

Coefficients:
                          Estimate Std. Error z value Pr(>|z|)    
(Intercept)                2.19722    0.23000   9.553   <2e-16 ***
professioncontrol         -0.20350    0.29217  -0.697   0.4861    
professionpilots           0.06782    0.33314   0.204   0.8387    
professiontrain operators -0.54724    0.30660  -1.785   0.0743 .  
professiontruck drivers   -0.62861    0.29568  -2.126   0.0335 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 831.18  on 1086  degrees of freedom
Residual deviance: 822.22  on 1082  degrees of freedom
AIC: 832.22

Number of Fisher Scoring iterations: 4

df$sleep <- ordered(df$sleep, levels = c("<6","6-8",">8"))
m <- glm(sleep ~ profession,data=df,family="binomial")
summary(m)


Call:
glm(formula = sleep ~ profession, family = "binomial", data = df)

Coefficients:
                          Estimate Std. Error z value Pr(>|z|)    
(Intercept)                2.19722    0.23000   9.553   <2e-16 ***
professioncontrol         -0.20350    0.29217  -0.697   0.4861    
professionpilots           0.06782    0.33314   0.204   0.8387    
professiontrain operators -0.54724    0.30660  -1.785   0.0743 .  
professiontruck drivers   -0.62861    0.29568  -2.126   0.0335 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 831.18  on 1086  degrees of freedom
Residual deviance: 822.22  on 1082  degrees of freedom
AIC: 832.22

Number of Fisher Scoring iterations: 4

m <- glm(profession ~ sleep,data=df,family="binomial")
summary(m)


Call:
glm(formula = profession ~ sleep, family = "binomial", data = df)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  1.46950    0.09877  14.878   <2e-16 ***
sleep.L     -0.35238    0.19797  -1.780   0.0751 .  
sleep.Q      0.01835    0.13907   0.132   0.8950    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1067.0  on 1086  degrees of freedom
Residual deviance: 1063.5  on 1084  degrees of freedom
AIC: 1069.5

Number of Fisher Scoring iterations: 4

Oops! It is only by coincidence that we get an output for each of these models. The glm() function is not applicable to multinomial models unless we do a lot of math on it. Even then, it’s not universally applicable. Much better is to actually construct a multinomial logistic regression model. We did that last time with the multinom() function, but the output was hard to interpret. Let’s use the tidy() function to make it easier to read.

pacman::p_load(nnet)
pacman::p_load(broom)
pacman::p_load(kableExtra)
m <- multinom(sleep ~ profession,data=df)

# weights:  18 (10 variable)
initial  value 1194.191558 
iter  10 value 964.457953
final  value 961.288627 
converged

tidy(m, conf.int = TRUE) |> kable() |> kable_styling("basic",full_width=FALSE)

y.level	term	estimate	std.error	statistic	p.value	conf.low	conf.high
6-8	(Intercept)	1.8306335	0.2350556	7.7880880	0.0000000	1.3699331	2.2913340
6-8	professioncontrol	-0.1233365	0.2983337	-0.4134181	0.6793003	-0.7080598	0.4613867
6-8	professionpilots	0.1076454	0.3397862	0.3168033	0.7513929	-0.5583234	0.7736142
6-8	professiontrain operators	-0.4188257	0.3132678	-1.3369574	0.1812366	-1.0328192	0.1951679
6-8	professiontruck drivers	-0.6238287	0.3039230	-2.0525879	0.0401126	-1.2195069	-0.0281505
>8	(Intercept)	1.0158943	0.2546738	3.9890023	0.0000664	0.5167429	1.5150458
>8	professioncontrol	-0.4123800	0.3302332	-1.2487539	0.2117551	-1.0596252	0.2348652
>8	professionpilots	-0.0285802	0.3702625	-0.0771890	0.9384732	-0.7542813	0.6971209
>8	professiontrain operators	-0.9174745	0.3613738	-2.5388517	0.0111217	-1.6257542	-0.2091949
>8	professiontruck drivers	-0.6394257	0.3362105	-1.9018613	0.0571893	-1.2983862	0.0195347

The kable stuff is because, by default, the tibble output format of tidy() only shows the first ten letters of a chr column, which is not enough to distinguish between them, given that they are all prefixed with the word “profession.” The kableExtra package provides the tools to construct a better-looking html table.

df$sleep <- ordered(df$sleep, levels = c(">8","6-8","<6"))
m <- multinom(sleep ~ profession,data=df)

# weights:  18 (10 variable)
initial  value 1194.191558 
iter  10 value 963.528061
final  value 961.288627 
converged

tidy(m, conf.int = TRUE) |> kable() |> kable_styling("basic",full_width=FALSE)

y.level	term	estimate	std.error	statistic	p.value	conf.low	conf.high
6-8	(Intercept)	0.8147439	0.1577176	5.1658408	0.0000002	0.5056231	1.1238646
6-8	professioncontrol	0.2890608	0.2137315	1.3524486	0.1762318	-0.1298451	0.7079668
6-8	professionpilots	0.1362254	0.2281630	0.5970531	0.5504719	-0.3109658	0.5834166
6-8	professiontrain operators	0.4986388	0.2540238	1.9629612	0.0496507	0.0007614	0.9965162
6-8	professiontruck drivers	0.0156043	0.2302817	0.0677617	0.9459753	-0.4357396	0.4669481
<6	(Intercept)	-1.0159415	0.2546780	-3.9891222	0.0000663	-1.5151012	-0.5167819
<6	professioncontrol	0.4123970	0.3302385	1.2487853	0.2117436	-0.2348586	1.0596525
<6	professionpilots	0.0285458	0.3702707	0.0770943	0.9385485	-0.6971714	0.7542630
<6	professiontrain operators	0.9174965	0.3613773	2.5388883	0.0111205	0.2092101	1.6257829
<6	professiontruck drivers	0.6394607	0.3362145	1.9019426	0.0571787	-0.0195076	1.2984290

Murders (not recommended for analysis)

load("murders.rda")
ls()

 [1] "data"                   "data_ord"               "df"                    
 [4] "fatal_police_shootings" "m"                      "mammals"               
 [7] "murders"                "my_colors"              "ord"                   
[10] "resume"                 "sleep_deprivation"      "tbl"                   
[13] "uniqdashc"              "wideScreen"             "x"

str(murders)

tibble [20 × 4] (S3: tbl_df/tbl/data.frame)
 $ population            : int [1:20] 587000 643000 635000 692000 1248000 643000 1964000 1531000 713000 749000 ...
 $ perc_pov              : num [1:20] 16.5 20.5 26.3 16.5 19.2 16.5 20.2 21.3 17.2 14.3 ...
 $ perc_unemp            : num [1:20] 6.2 6.4 9.3 5.3 7.3 5.9 6.4 7.6 4.9 6.4 ...
 $ annual_murders_per_mil: num [1:20] 11.2 13.4 40.7 5.3 24.8 12.7 20.9 35.7 8.7 9.6 ...

df <- murders
m <- lm(annual_murders_per_mil ~ .,data=df)
summary(m)


Call:
lm(formula = annual_murders_per_mil ~ ., data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.7174 -3.3233  0.4031  1.7684 10.0329 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -3.676e+01  7.011e+00  -5.244 8.03e-05 ***
population   7.629e-07  6.363e-07   1.199  0.24798    
perc_pov     1.192e+00  5.617e-01   2.123  0.04974 *  
perc_unemp   4.720e+00  1.530e+00   3.084  0.00712 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.59 on 16 degrees of freedom
Multiple R-squared:  0.8183,    Adjusted R-squared:  0.7843 
F-statistic: 24.02 on 3 and 16 DF,  p-value: 3.629e-06

When people are unemployed, their thoughts turn to murder. Wait, wut?

NBA players 2018–2019

load("nba_players_19.rda")
ls()

 [1] "data"                   "data_ord"               "df"                    
 [4] "fatal_police_shootings" "m"                      "mammals"               
 [7] "murders"                "my_colors"              "nba_players_19"        
[10] "ord"                    "resume"                 "sleep_deprivation"     
[13] "tbl"                    "uniqdashc"              "wideScreen"            
[16] "x"

str(nba_players_19)

tibble [494 × 7] (S3: tbl_df/tbl/data.frame)
 $ first_name: chr [1:494] "Alex" "Jaylen" "Steven" "Bam" ...
 $ last_name : chr [1:494] "Abrines" "Adams" "Adams" "Adebayo" ...
 $ team      : chr [1:494] "Thunder" "Hawks" "Thunder" "Heat" ...
 $ team_abbr : chr [1:494] "OKC" "ATL" "OKC" "MIA" ...
 $ position  : chr [1:494] "Guard" "Guard" "Center" "Center-Forward" ...
 $ number    : chr [1:494] "8" "10" "12" "13" ...
 $ height    : num [1:494] 78 74 84 82 78 83 77 77 83 81 ...

df <- nba_players_19
df |> ggplot(aes(position,height)) +
  geom_boxplot() +
  geom_jitter(alpha=0.2)

Poker (anonymous)

load("poker.rda")
ls()

 [1] "data"                   "data_ord"               "df"                    
 [4] "fatal_police_shootings" "m"                      "mammals"               
 [7] "murders"                "my_colors"              "nba_players_19"        
[10] "ord"                    "poker"                  "resume"                
[13] "sleep_deprivation"      "tbl"                    "uniqdashc"             
[16] "wideScreen"             "x"

str(poker)

tibble [50 × 1] (S3: tbl_df/tbl/data.frame)
 $ winnings: int [1:50] -110 -9 -60 316 -200 -196 320 -160 31 331 ...

df <- poker
stem(df$winnings)


  The decimal point is 3 digit(s) to the right of the |

  -1 | 00
  -0 | 9875
  -0 | 333222221111100000
   0 | 000001111112233333444
   0 | 889
   1 | 
   1 | 7
   2 | 
   2 | 
   3 | 
   3 | 7

df |> ggplot(aes(winnings)) +
  geom_histogram()

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

SAT and GPA data

load("satgpa.rda")
ls()

 [1] "data"                   "data_ord"               "df"                    
 [4] "fatal_police_shootings" "m"                      "mammals"               
 [7] "murders"                "my_colors"              "nba_players_19"        
[10] "ord"                    "poker"                  "resume"                
[13] "satgpa"                 "sleep_deprivation"      "tbl"                   
[16] "uniqdashc"              "wideScreen"             "x"

str(satgpa)

tibble [1,000 × 6] (S3: tbl_df/tbl/data.frame)
 $ sex    : int [1:1000] 1 2 2 1 1 2 1 1 2 1 ...
 $ sat_v  : int [1:1000] 65 58 56 42 55 55 57 53 67 41 ...
 $ sat_m  : int [1:1000] 62 64 60 53 52 56 65 62 77 44 ...
 $ sat_sum: int [1:1000] 127 122 116 95 107 111 122 115 144 85 ...
 $ hs_gpa : num [1:1000] 3.4 4 3.75 3.75 4 4 2.8 3.8 4 2.6 ...
 $ fy_gpa : num [1:1000] 3.18 3.33 3.25 2.42 2.63 2.91 2.83 2.51 3.82 2.54 ...

df <- satgpa
# m <- glm(sex ~ .,data=df,family="binomial")
table(df$sex)


  1   2 
516 484

df$sex <- ifelse(df$sex == 2,0,1)
m <- glm(sex ~ .,data=df,family="binomial")
summary(m)


Call:
glm(formula = sex ~ ., family = "binomial", data = df)

Coefficients: (1 not defined because of singularities)
             Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.600177   0.553805  -2.889 0.003859 ** 
sat_v       -0.004676   0.009844  -0.475 0.634824    
sat_m        0.107299   0.010710  10.019  < 2e-16 ***
sat_sum            NA         NA      NA       NA    
hs_gpa      -0.920906   0.161527  -5.701 1.19e-08 ***
fy_gpa      -0.400117   0.117156  -3.415 0.000637 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1385.3  on 999  degrees of freedom
Residual deviance: 1233.7  on 995  degrees of freedom
AIC: 1243.7

Number of Fisher Scoring iterations: 4

Who knows which sex is which? Anyway.

m <- lm(sat_m ~ hs_gpa,data=df)
summary(m)


Call:
lm(formula = sat_m ~ hs_gpa, data = df)

Residuals:
     Min       1Q   Median       3Q      Max 
-25.0738  -5.2266   0.2412   5.0743  22.7225 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  35.5320     1.4829   23.96   <2e-16 ***
hs_gpa        5.8982     0.4572   12.90   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 7.827 on 998 degrees of freedom
Multiple R-squared:  0.1429,    Adjusted R-squared:  0.1421 
F-statistic: 166.4 on 1 and 998 DF,  p-value: < 2.2e-16

m <- lm(fy_gpa ~ sat_m,data=df)
summary(m)


Call:
lm(formula = fy_gpa ~ sat_m, data = df)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.28486 -0.44666  0.00499  0.49423  1.68271 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 0.621899   0.140849   4.415 1.12e-05 ***
sat_m       0.033938   0.002559  13.264  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.6834 on 998 degrees of freedom
Multiple R-squared:  0.1499,    Adjusted R-squared:  0.149 
F-statistic: 175.9 on 1 and 998 DF,  p-value: < 2.2e-16