Introduction
Lately I have been indulged in learning all things tidymodels in my after office hours. But I was missing something - the effectiveness of my learning journey. Committing to large scale competition was unwieldy but then came #Sliced - a data science problem solving 2-hour sprint with small datasets.
Here, I am trying to tackle the S0102 problem with Aircraft wildlife strikes dataset.
Load Libraries and other presets
library(tidyverse) # Data Wrangling
library(tidymodels) # Modelling
library(themis) # Class imbalance
library(parallel) # Parallel operations
library(doParallel) # Parallel operations
library(tictoc) # Timing
library(GGally) # For pair plots
Enable parallel processing
cores<-detectCores(logical=F)-1
# cores
core_cluster<-makePSOCKcluster(cores)
# core_cluster
registerDoParallel(core_cluster)
Get Train and test data
Here I did a few things other than reading data - * Converted all character columns to factors to play nice with different models * Releveled factor so that our target outcome true label is “damaged”. It will make it easier to read the results.
Train data
train_orig<-read_csv("Data/S01E02/train.csv",
guess_max=1e5)%>%
mutate(
damaged=case_when(
damaged>0 ~ "damaged",
TRUE ~ "no damage"
),
across(where(is.character),as_factor),
damaged=fct_relevel(damaged,"damaged")
)
# Check out training data
train_orig%>%
glimpse()
## Rows: 21,000
## Columns: 34
## $ id <dbl> 23637, 8075, 5623, 19605, 15142, 27235, 12726, 20781,~
## $ incident_year <dbl> 1996, 1999, 2011, 2007, 2007, 2013, 2002, 2013, 2015,~
## $ incident_month <dbl> 11, 6, 12, 9, 9, 5, 5, 5, 7, 8, 10, 9, 11, 7, 5, 3, 3~
## $ incident_day <dbl> 7, 26, 1, 13, 13, 28, 4, 19, 22, 22, 21, 7, 2, 7, 20,~
## $ operator_id <fct> MIL, UAL, SWA, SWA, MIL, UNK, UAL, BUS, UNK, BUS, EGF~
## $ operator <fct> MILITARY, UNITED AIRLINES, SOUTHWEST AIRLINES, SOUTHW~
## $ aircraft <fct> T-1A, B-757-200, B-737-300, B-737-700, KC-135R, UNKNO~
## $ aircraft_type <fct> A, A, A, A, A, NA, A, A, NA, A, A, A, A, NA, A, A, A,~
## $ aircraft_make <fct> 748, 148, 148, 148, NA, NA, 148, 226, NA, NA, 332, 58~
## $ aircraft_model <dbl> NA, 26, 24, 42, NA, NA, 97, 7, NA, NA, 14, 22, 37, NA~
## $ aircraft_mass <dbl> 3, 4, 4, 4, NA, NA, 4, 1, NA, 1, 3, 4, 4, NA, 4, 4, 4~
## $ engine_make <dbl> 31, 34, 10, 10, NA, NA, 34, 7, NA, NA, 1, 34, 34, NA,~
## $ engine_model <fct> 1, 40, 1, 1, NA, NA, 46, 10, NA, NA, 10, 10, 10, NA, ~
## $ engines <dbl> 2, 2, 2, 2, NA, NA, 2, 1, NA, 2, 2, 2, 2, NA, 2, 2, 2~
## $ engine_type <fct> D, D, D, D, NA, NA, D, A, NA, C, D, D, D, NA, D, D, D~
## $ engine1_position <dbl> 5, 1, 1, 1, NA, NA, 1, 7, NA, 3, 5, 5, 5, NA, 1, 1, 5~
## $ engine2_position <dbl> 5, 1, 1, 1, NA, NA, 1, NA, NA, 3, 5, 5, 5, NA, 1, 1, ~
## $ engine3_position <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ engine4_position <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ airport_id <fct> KLBB, ZZZZ, KOAK, KSAT, KGFK, KMDT, KJFK, KIGQ, KEWR,~
## $ airport <fct> "LUBBOCK PRESTON SMITH INTL ARPT", "UNKNOWN", "METRO ~
## $ state <fct> TX, NA, CA, TX, ND, PA, NY, IL, NJ, UT, MO, MO, AZ, C~
## $ faa_region <fct> ASW, NA, AWP, ASW, AGL, AEA, AEA, AGL, AEA, ANM, ACE,~
## $ flight_phase <fct> LANDING ROLL, NA, LANDING ROLL, APPROACH, APPROACH, N~
## $ visibility <fct> DAY, NA, DAY, NIGHT, NIGHT, NA, NA, NIGHT, NA, NIGHT,~
## $ precipitation <fct> NA, NA, "NONE", "NONE", NA, NA, NA, "FOG", NA, NA, "N~
## $ height <dbl> 0, NA, 0, 300, NA, NA, NA, 2700, NA, 0, 3500, 1400, 0~
## $ speed <dbl> 80, NA, NA, 130, 140, NA, NA, 110, NA, NA, 180, 170, ~
## $ distance <dbl> 0, NA, 0, NA, NA, 0, NA, NA, 0, 0, NA, NA, 0, 0, 0, 0~
## $ species_id <fct> UNKBM, UNKBM, ZT002, UNKBS, ZT105, YI005, UNKBM, UNKB~
## $ species_name <fct> "UNKNOWN MEDIUM BIRD", "UNKNOWN MEDIUM BIRD", "WESTER~
## $ species_quantity <fct> 1, 1, 1, 1, NA, 1, 1, 1, 1, 1, 1, 2-10, 1, 1, 2-10, 1~
## $ flight_impact <fct> NA, NA, NONE, NONE, NA, NA, NA, NONE, NA, NA, NONE, N~
## $ damaged <fct> no damage, damaged, no damage, no damage, no damage, ~
Test data
test<-read_csv("Data/S01E02/test.csv",
guess_max = 1e5
)%>%
mutate(
across(where(is.character),as_factor)
)
Exploratory Data Analysis
Here I am not much focusing on EDA. But some charts will not hurt. Taken some queue from Julia Silge’s post and added some of my own on top.
Class balance check
The outcome is severely imbalanced. We will address that in pre-processing step.
train_orig%>%
count(damaged)%>%
ggplot(aes(damaged,n,fill=damaged))+
geom_col()+
geom_text(aes(label=n))
balance_share<-train_orig%>%
count(damaged)%>%
mutate(
share=n/sum(n)
)%>%
slice_max(share)%>%
select(share)%>%
pull()
balance_share # Will be using this on another viz
## [1] 0.9143333
Checking Pair plots of numeric variables
So many variables made the plot ugly. We see that some variables like speed, height . But for this model I will be skipping this pre-processing.
train_orig%>%
select(damaged,incident_year,aircraft_mass,
engines,contains("_position"),
height,speed,distance
)%>%
ggpairs(columns = 2:11,
aes(color=damaged,alpha=0.5),
progress=FALSE
)
Categorical variables exploration
So many categorical variables! Lets visualize those with less than 15 levels and see whether they are inline with the outcome imbalance.
categoric_summary<-train_orig%>%
select(where(is_character),where(is.factor))%>%
map_df(~tibble(
distinct=n_distinct(.x)
),
.id="variable"
)%>%
arrange(-distinct)
categoric_summary
## # A tibble: 20 x 2
## variable distinct
## <chr> <int>
## 1 airport_id 1039
## 2 airport 1039
## 3 species_id 447
## 4 species_name 446
## 5 aircraft 424
## 6 operator_id 276
## 7 operator 275
## 8 aircraft_make 63
## 9 state 61
## 10 engine_model 40
## 11 faa_region 15
## 12 flight_phase 13
## 13 engine_type 9
## 14 precipitation 9
## 15 flight_impact 7
## 16 visibility 6
## 17 engine3_position 5
## 18 species_quantity 5
## 19 aircraft_type 3
## 20 damaged 2
train_orig%>%
select(damaged,categoric_summary%>%
filter(distinct<=15)%>%
select(variable)%>%
pull()
)%>%
pivot_longer(2:last_col())%>%
ggplot(aes(y=value,fill=damaged))+
geom_bar(position="fill")+
geom_vline(xintercept = balance_share)+
facet_wrap(vars(name),scales="free",ncol=2)+
labs(
x=NULL,y=NULL,fill=NULL
)
Closer inspection indeed indicates that some levels may provide better predictive capability. Dummy variables encoding may work wonders.
Limited variables dataframe
Instead of lots of variables, I choose to use some select variables (taking cues from other bloggers and adding some of mine) for my model.
train_df<-train_orig%>%
select(
damaged, flight_impact, precipitation,
visibility, flight_phase, engines, incident_year,
species_id, engine_type,aircraft_model,
species_quantity, height, speed, distance
)
train_df%>%
glimpse()
## Rows: 21,000
## Columns: 14
## $ damaged <fct> no damage, damaged, no damage, no damage, no damage, ~
## $ flight_impact <fct> NA, NA, NONE, NONE, NA, NA, NA, NONE, NA, NA, NONE, N~
## $ precipitation <fct> NA, NA, "NONE", "NONE", NA, NA, NA, "FOG", NA, NA, "N~
## $ visibility <fct> DAY, NA, DAY, NIGHT, NIGHT, NA, NA, NIGHT, NA, NIGHT,~
## $ flight_phase <fct> LANDING ROLL, NA, LANDING ROLL, APPROACH, APPROACH, N~
## $ engines <dbl> 2, 2, 2, 2, NA, NA, 2, 1, NA, 2, 2, 2, 2, NA, 2, 2, 2~
## $ incident_year <dbl> 1996, 1999, 2011, 2007, 2007, 2013, 2002, 2013, 2015,~
## $ species_id <fct> UNKBM, UNKBM, ZT002, UNKBS, ZT105, YI005, UNKBM, UNKB~
## $ engine_type <fct> D, D, D, D, NA, NA, D, A, NA, C, D, D, D, NA, D, D, D~
## $ aircraft_model <dbl> NA, 26, 24, 42, NA, NA, 97, 7, NA, NA, 14, 22, 37, NA~
## $ species_quantity <fct> 1, 1, 1, 1, NA, 1, 1, 1, 1, 1, 1, 2-10, 1, 1, 2-10, 1~
## $ height <dbl> 0, NA, 0, 300, NA, NA, NA, 2700, NA, 0, 3500, 1400, 0~
## $ speed <dbl> 80, NA, NA, 130, 140, NA, NA, 110, NA, NA, 180, 170, ~
## $ distance <dbl> 0, NA, 0, NA, NA, 0, NA, NA, 0, 0, NA, NA, 0, 0, 0, 0~
Partitioning
Here I split the original training data into train and validation set. Then resample the train set with 10-fold stratified cross validation. Will use the validation set for another layer of checking of the model.
set.seed(123)
splits<-initial_split(train_df,strata = damaged)
train<-training(splits)
valid<-testing(splits)
folds<-vfold_cv(train,v = 10,strata = damaged)
Recipe to preprocess
Here the strategy is - * Handle factors with many levels by lumping them together * Removing highly correlated numeric variables * Dummy encoding * Imputation, zero variance feature removal * Wanted to upsample. But results were poor. So, keeping that option out
rec_base<-train%>%
recipe(damaged~.)%>%
step_corr(all_numeric_predictors())%>%
step_novel(all_nominal_predictors())%>%
step_other(all_nominal_predictors(),threshold=0.01)%>%
step_unknown(all_nominal_predictors())%>%
step_dummy(all_nominal_predictors())%>%
step_impute_median(all_numeric_predictors())%>%
step_zv(all_predictors())
rec_base
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 13
##
## Operations:
##
## Correlation filter on all_numeric_predictors()
## Novel factor level assignment for all_nominal_predictors()
## Collapsing factor levels for all_nominal_predictors()
## Unknown factor level assignment for all_nominal_predictors()
## Dummy variables from all_nominal_predictors()
## Median Imputation for all_numeric_predictors()
## Zero variance filter on all_predictors()
Model to use - GLMNet
We set two tunable parameters in GLMNet model.
logistic_reg_glmnet_spec <-
logistic_reg(penalty = tune(), mixture = tune()) %>%
set_engine('glmnet')
Create Workflow
LR_glmnet_wf<-workflow()%>%
add_recipe(rec_base)%>%
add_model(logistic_reg_glmnet_spec)
LR_glmnet_wf
## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: logistic_reg()
##
## -- Preprocessor ----------------------------------------------------------------
## 7 Recipe Steps
##
## * step_corr()
## * step_novel()
## * step_other()
## * step_unknown()
## * step_dummy()
## * step_impute_median()
## * step_zv()
##
## -- Model -----------------------------------------------------------------------
## Logistic Regression Model Specification (classification)
##
## Main Arguments:
## penalty = tune()
## mixture = tune()
##
## Computational engine: glmnet
Tune grid for hyperparameter tuning
We vary with 20 different combination of penalty and mixture.
LR_glmnet_grid<-grid_max_entropy(
parameters(LR_glmnet_wf),
size=20
)
LR_glmnet_grid%>%
head()
## # A tibble: 6 x 2
## penalty mixture
## <dbl> <dbl>
## 1 2.50e-10 0.0883
## 2 5.95e- 7 0.669
## 3 2.20e- 6 0.451
## 4 3.76e- 9 0.973
## 5 7.92e- 5 0.0864
## 6 2.28e- 4 0.777
Tuning
tic()
metrics_interest<-metric_set(mn_log_loss)
set.seed(123)
LR_glmnet_tune<-LR_glmnet_wf%>%
tune_grid(
resamples=folds,
grid=LR_glmnet_grid,
metrics=metrics_interest,
control=control_grid(parallel_over =NULL)
)
toc()
## 176.88 sec elapsed
LR_glmnet_tune%>%
collect_metrics()%>%
slice_min(mean,n=5)
## # A tibble: 5 x 8
## penalty mixture .metric .estimator mean n std_err .config
## <dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 4.59e- 4 0.978 mn_log_loss binary 0.216 10 0.00477 Preprocessor1_Mod~
## 2 8.38e- 4 0.404 mn_log_loss binary 0.216 10 0.00476 Preprocessor1_Mod~
## 3 2.28e- 4 0.777 mn_log_loss binary 0.216 10 0.00478 Preprocessor1_Mod~
## 4 1.09e-10 0.227 mn_log_loss binary 0.216 10 0.00479 Preprocessor1_Mod~
## 5 8.99e- 7 0.213 mn_log_loss binary 0.216 10 0.00478 Preprocessor1_Mod~
Get the best parameters and fit on full training set
best_params<-LR_glmnet_tune%>%select_best()
best_params
## # A tibble: 1 x 3
## penalty mixture .config
## <dbl> <dbl> <chr>
## 1 0.000459 0.978 Preprocessor1_Model20
final_wf_fit<-LR_glmnet_wf%>%
finalize_workflow(best_params)%>%
fit(data=train)
Predict on validation sets
valid_preds<-final_wf_fit%>%
augment(valid)
valid_preds%>%
metrics_interest(truth=damaged,estimate=.pred_class,.pred_damaged)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 mn_log_loss binary 0.221
Validation metrics are pretty close to our resampling metrics. So, we may not be overfitting.
Predict on test set and submission file
test_preds<-final_wf_fit%>%
augment(test)%>%
select(id,damaged=.pred_damaged)
test_preds%>%head()
## # A tibble: 6 x 2
## id damaged
## <dbl> <dbl>
## 1 11254 0.0902
## 2 27716 0.0149
## 3 29066 0.0218
## 4 3373 0.0724
## 5 1996 0.0684
## 6 18061 0.00466
stopCluster(core_cluster)
Conclusion
This yielded a score of 0.219 in private leaderboard and 0.165 in public leaderboard. You can find details submission stats on my Kaggle submission notebook.
Sharing is caring. Share this story in...
Share: Twitter Facebook LinkedIn Pocket Flipboard