Debunking the CT State Economic Index

 ggplot(hdi) +
      geom_segment( aes(x=reorder(state, hdi[,7]), xend=state, y=hdi[,7], yend=hdi[,6]), color="grey") +
      geom_point( aes(x=reorder(state, hdi[,7]), y=hdi[,7]), color="darkred", size=3 ) +
      geom_point( aes(x=reorder(state, hdi[,7]), y=hdi[,6]), color="darkblue", size=3 ) +
      ggplot2::annotate("text", x = 30, y = 115, 
               label = "2023", 
               color = "darkred", size = 5, hjust = -0.1, vjust = .75) +
      ggplot2::annotate("text", x = 42, y = 100, label = "2022", 
               color = "darkblue", size = 5, hjust = -0.1, vjust = -.1) +
      coord_flip()+
      theme_minimal() +
      theme(legend.position = "none") +
      xlab("") +
      ylab("Economic Performance Index, 2019 == 100") +
      labs(caption = "Source: CT Dept of Labor, CT Dept Economic & Community Development ") +
      ggtitle("Economic Performance Index, 2022:2023") +
      theme(
        legend.position = "none",
        #text = element_text(family = "Georgia"),
        axis.text.y = element_text(size = 10),
        plot.title = element_text(size = 16, 
                                  margin = ggplot2::margin(b = 10), 
                                  hjust = 0),
        plot.subtitle = element_text(size = 12, color = "black", 
                                  margin = ggplot2::margin(b = 25, l = -25)
                                     ),
        plot.caption = element_text(size = 8, 
                                    margin = ggplot2::margin(t = 10), 
                                    color = "grey70", hjust = 0))

hdi$`change22-23` =as.numeric(hdi$`change22-23`)

g1 = ggplot(hdi, aes(x = hdi[,2], y = reorder(state, hdi[,2]), color = state)) +
  geom_segment(aes(x = 0, y = state, xend = hdi[,2], yend = state), color = "grey") +
  geom_point() +
  theme(legend.position = "NULL")+
  ggplot2::annotate("text", x = 0.05, y = 20, label = "Positive Growth", color = "#00BFC4", 
           size = 5, hjust = -0.1, vjust = .75) +
  ggplot2::annotate("text", x = 0.05, y = 5, label = "Negative Growth", color = "#F8766D", 
           size = 5, hjust = -0.1, vjust = -.1) +
  geom_vline(xintercept = 0, col = "darkred", linewidth =0.25) + 
  geom_segment(aes(x = 0.035, xend = 0.035, y = 15, yend = 30),
               arrow = arrow(length = unit(0.2,"cm")), 
               linewidth = 1.2,
               color = "#00BFC4") +
  geom_segment(aes(x = 0.035, xend = 0.035 , y = 10, yend = 2),
               arrow = arrow(length = unit(0.2,"cm")), 
               linewidth = 1.2,
               color = "#F8766D") +
  ggtitle("State Economic Performance:
          Year to Year Change 2022-2023")  +
       xlab("Percent Change") +
  ylab("") +
          labs(caption = "Source: CT Dept of Labor, CT Dept Economic & Community Development ")+
  theme_minimal() +
  scale_x_continuous(labels = scales::percent_format()) +
  theme(
    #axis.title = element_blank(),
    #    panel.grid.minor = element_blank(),
        legend.position = "none",
        #text = element_text(family = "Georgia"),
        axis.text.y = element_text(size = 8),
        plot.title = element_text(size = 15, 
                                  margin = ggplot2::margin(b = 10), 
                                  hjust = 0),
        plot.subtitle = element_text(size = 12, color = "darkslategrey", 
                                     margin = ggplot2::margin(b = 25, l = -25)),
        plot.caption = element_text(size = 8, 
                                    margin = ggplot2::margin(t = 10), 
                                    color = "grey70", hjust = 0))


suppressWarnings(print(g1))

library(readxl)

my_custom_name_repair <- function(nms) tolower(gsub(" ", "_", nms))
    #my_custom_name_repair <- function(nms) tolower(nms)


myremovex = function(nms) gsub("x", "", nms)
roster_raw <- read_excel("SEI2010-2023_AROD.xlsx", 
                         sheet = "SEI",
                         range = "A2:O53",
                         #.name_repair = myremovex
                         .name_repair = make_clean_names
                         #.name_repair = my_custom_name_repair
                        )
    
myfixstate = function(nms) gsub(" ", "", nms)
roster_raw$state = myfixstate(roster_raw$state)

hdi = roster_raw |> dplyr::select(state, "x2022", "x2023")
colnames(hdi) = c("state", "2022", "2023")

hdi$`change22-23` = (hdi$`2023` - hdi$`2022`)/hdi$`2022`
hdi$`change22-23` = as.numeric(hdi$`change22-23`)

hdi = as.data.frame(hdi)
## Establishment =====
establish_raw <- read_excel("SEI2010-2023_AROD.xlsx", 
                         sheet = "establishment",
                         range = "AS2:AS53",
                         .name_repair = make_clean_names
)

employment_raw <- read_excel("SEI2010-2023_AROD.xlsx", 
                            sheet = "employment",
                            range = "AS2:AS53",
                            .name_repair = make_clean_names
)


realwage_raw <- read_excel("SEI2010-2023_AROD.xlsx", 
                            sheet = "real wage",
                            range = "AS2:AS53",
                            .name_repair = make_clean_names
)


unemp_raw <- read_excel("SEI2010-2023_AROD.xlsx", 
                            sheet = "unemployment rate",
                            range = "BH2:BH53",
                            .name_repair = make_clean_names
)


hdi = cbind.data.frame(hdi, establish_raw, employment_raw, realwage_raw, unemp_raw)

colnames(hdi)[5:8] = c("establish", "employment", "realwages", "unemployment")

myhdi = hdi |> dplyr::select(-c("2022", "change22-23"))
colnames(myhdi)[2] = "hdi"

last run in the editor is displayed.

g3 = ggplot(hdi) +
  geom_segment( aes(x=reorder(state, hdi[,3]), xend=state, y=hdi[,3], yend=hdi[,2]), color="grey") +
  #geom_point( aes(x=reorder(state, hdi[,7]), y=hdi[,7]), color=rgb(0.5,0.7,0.1,0.5), size=3 ) +
  geom_point( aes(x=reorder(state, hdi[,3]), y=hdi[,3]), color="darkred", size=3 ) +
  geom_point( aes(x=reorder(state, hdi[,3]), y=hdi[,2]), color="darkblue", size=3 ) +
  ggplot2::annotate("text", x = 30, y = 115, 
           label = "2023", 
           color = "darkred", size = 5, hjust = -0.1, vjust = .75) +
  ggplot2::annotate("text", x = 42, y = 100, label = "2022", 
           color = "darkblue", size = 5, hjust = -0.1, vjust = -.1) +
  coord_flip()+
  theme_minimal() +
  #theme(legend.position = "none") +
  xlab("") +
  ylab("Economic Performance Index, 2019 == 100") +
  ggtitle("Economic Performance Index, 2022:2023") +
  theme(
    legend.position = "none",
    #text = element_text(family = "Georgia"),
    axis.text.y = element_text(size = 10),
    plot.title = element_text(size = 16, 
                              #margin = margin(b = 10), 
                              hjust = 0),
    plot.subtitle = element_text(size = 12, color = "black"
    #margin = margin(b = 25, l = -25)
    ),
    plot.caption = element_text(size = 8, 
                                #margin = margin(t = 10), 
                                color = "grey70", hjust = 0)
    )

suppressWarnings(print(g3))

# RF

library(Boruta)

myhdi = myhdi |> column_to_rownames("state")
head(myhdi)
myboruta = Boruta(hdi~., myhdi )
plot(myboruta)

      #cbind(rank(myhdi$hdi), rank(myhdi$unemployment))
      #matplot(cbind(sort(rank(myhdi$hdi)), sort(rank(myhdi$unemployment))), type = "l")
      
      plot(rank(myhdi$hdi), rank(myhdi$unemployment),
           xlab="Rank of State Performance Index", 
           ylab="Rank of State Unemployment Performance"
           , type="n"
           )
      text(rank(myhdi$hdi), rank(myhdi$unemployment), rownames(myhdi))
      abline(lm(rank(myhdi$hdi) ~rank(myhdi$unemployment)))

myhdi_scale = scale(myhdi) |> as.data.frame()
mymod = lm(hdi ~ unemployment +
            establish +
             employment +
             realwages, 
           #data = myhdi_scale
           data = myhdi)
summary(mymod)

Call:
lm(formula = hdi ~ unemployment + establish + employment + realwages, 
    data = myhdi)

Residuals:
                 Min                   1Q               Median 
-0.00000000000015049 -0.00000000000000344  0.00000000000000402 
                  3Q                  Max 
 0.00000000000000871  0.00000000000002395 

Coefficients:
                         Estimate           Std. Error             t value
(Intercept)  0.000000000000055718 0.000000000000155207                0.36
unemployment 0.999999999999999001 0.000000000000000705 1418315641389190.50
establish    0.999999999999998890 0.000000000000001699  588436433807179.25
employment   0.999999999999998224 0.000000000000005238  190921133365290.50
realwages    1.000000000000000888 0.000000000000007302  136943094457266.08
                        Pr(>|t|)    
(Intercept)                 0.72    
unemployment <0.0000000000000002 ***
establish    <0.0000000000000002 ***
employment   <0.0000000000000002 ***
realwages    <0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.0000000000000239 on 46 degrees of freedom
Multiple R-squared:     1,  Adjusted R-squared:     1 
F-statistic: 6.05e+29 on 4 and 46 DF,  p-value: <0.0000000000000002
#reghelper::beta(mymod)
barplot(mymod$coefficients[2:5], names.arg = colnames(myhdi[,2:5]))

cor(myhdi)
               hdi establish employment realwages unemployment
hdi          1.000     0.375      0.316     0.266        0.804
establish    0.375     1.000      0.509     0.347       -0.207
employment   0.316     0.509      1.000     0.598       -0.154
realwages    0.266     0.347      0.598     1.000       -0.106
unemployment 0.804    -0.207     -0.154    -0.106        1.000
mymod = lm(rank(myhdi$hdi) ~rank(myhdi$unemployment) +
             rank(myhdi$establish) +
             rank(myhdi$employment) +
             rank(myhdi$realwages))
summary(mymod)

Call:
lm(formula = rank(myhdi$hdi) ~ rank(myhdi$unemployment) + rank(myhdi$establish) + 
    rank(myhdi$employment) + rank(myhdi$realwages))

Residuals:
    Min      1Q  Median      3Q     Max 
-10.405  -2.350   0.009   1.864  15.538 

Coefficients:
                         Estimate Std. Error t value             Pr(>|t|)    
(Intercept)              -14.8792     2.1174   -7.03       0.000000008296 ***
rank(myhdi$unemployment)   0.9018     0.0442   20.38 < 0.0000000000000002 ***
rank(myhdi$establish)      0.4091     0.0485    8.44       0.000000000067 ***
rank(myhdi$employment)     0.1177     0.0576    2.04                0.047 *  
rank(myhdi$realwages)      0.1437     0.0545    2.63                0.011 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.55 on 46 degrees of freedom
Multiple R-squared:  0.914, Adjusted R-squared:  0.906 
F-statistic:  122 on 4 and 46 DF,  p-value: <0.0000000000000002
#reghelper::beta(mymod)
barplot(mymod$coefficients[2:5], names.arg = colnames(myhdi[,2:5]))

library(randomForest)
rand_frst <- randomForest(hdi~ 
                            establish + 
                            employment + 
                            realwages + 
                            #realgdp +
                            unemployment, 
                          data=myhdi, 
                          ntree=100, 
                          keep.forest=TRUE, 
                          importance=TRUE)

iscores = randomForest::varImpPlot(rand_frst, 
                         sort=FALSE, 
                         #conditional = TRUE,
                         scale = FALSE,
                         main="Variable Importance Plot")

barplot(iscores[,1])

library(e1071)
svmfit = randomForest(x= myhdi[,2:5],y= myhdi[,1], kernel = "linear", scale = TRUE)
var_imp <- caret::varImp(svmfit, scale = FALSE)
var_imp
#sort the score in decreasing order
var_imp_df <- data.frame(cbind(variable = rownames(var_imp), score = var_imp[,1]))
var_imp_df$score <- as.double(var_imp_df$score)
var_imp_df[order(var_imp_df$score,decreasing = TRUE),]
ggplot(var_imp_df, aes(x=reorder(variable, score), y=score)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=score)) +
  ylab("IncNodePurity") +
  xlab("Variable Name") +
  coord_flip()

library(rminer)
M <- fit(hdi~., data=myhdi, 
         model="svm", 
         #model = "lm",
         kpar=list(sigma=0.10), C=2)
svm.imp <- rminer::Importance(M, data=myhdi)
#svm.imp$imp
barplot(svm.imp$imp[2:5], names.arg = names(myhdi)[2:5])