rSDR vignette

Sheau-Chiann Chen, Shilin Zhao and Hsin-Hsiung Bill Huang

Introduction

This R Markdown document demonstrates the usage of the rSDR package for Robust Sufficient Dimension Reduction (rSDR) using alpha-distance covariance and Stiefel manifold learning for estimation (Hsin-Hsiung Huang, Feng Yu & Teng Zhang, 2024). We apply the method to ionosphere dataset, including cross-validation, bootstrap methods for optimal alpha selection, and visualization of results for both 2 and 3 dimensions.

Setup

# 
# # Load rSDR package
# if (!requireNamespace("rSDR", quietly = TRUE)) {
#   if (!requireNamespace("devtools", quietly = TRUE)){   install.packages("devtools")
# }
#   library(devtools)
#   devtools::install_github("scchen1/rSDR")
#   # or
#   
#   #a local zip file (`rSDR-main.zip`) downloaded from GitHub
#   #devtools::install_local(path='yourpath/rSDR-main.zip')
# }


library(rSDR)

Data Preparation

We use the ionosphere dataset from the fdm2id package. The dataset has 33 predictor variables (V1-V33) and one response variable (V35, ‘g’ for good or ‘b’ for bad).

The data include a response variable Y with nx1 matrix and predictors X with nxp matrix, where n is the number of observation and p is the number of variable.

#Load ionosphere data in fdm2id package
utils::data("ionosphere", package = "fdm2id")

X<-as.matrix(ionosphere[,c(1:33)])
Y<-ifelse(ionosphere[,34]=='b',0,1)
# Y will be used for rSDR
Y<-matrix(Y,length(Y),1)

#Y.factor will be used in plot_rSDR
Y.factor<-factor(ionosphere$V35,levels=c('b','g'),labels=c('Bad','Good'))

Robust Sufficient Dimension Reduction

Example: 3-Dimensional Reduction

set.seed(2435)
sdr_result<-rSDR(X=X, Y=Y, d=3, alpha=0.3,maxiter=1000,tol=1e-7)


# An optimal of C is obtained by maximizing the target function using 
# ManifoldOptim method
head(sdr_result$C_value)
#>              [,1]        [,2]        [,3]
#> [1,]  0.060376161  0.12984351  0.32228204
#> [2,]  0.141480457 -0.09261385  0.08642944
#> [3,] -0.037722305  0.06518416  0.14365305
#> [4,]  0.205080004 -0.30551967 -0.00615399
#> [5,] -0.005404643  0.10617035  0.17880486
#> [6,]  0.225708005  0.06038528 -0.06383628

#The value of cost function f is equal to the negative of the target function.
sdr_result$f_value
#> [1] -0.05426311

#beta=sigma_x^(-0.5)%*%C 
head(sdr_result$beta)
#>              [,1]       [,2]       [,3]
#> [1,]  0.005052794  0.7568295  0.5552178
#> [2,]  0.104607845 -0.4298763  0.1027418
#> [3,] -0.286795008  0.3727621  0.8018906
#> [4,]  0.306400216 -0.7760211 -0.3561266
#> [5,] -0.161184953  0.2073087  0.5865177
#> [6,]  0.297500484  0.5492788 -0.5587245

#projected_data: This projects X onto the estimated SDR directions (X %*% beta),
#the projected matrix (data) n x d
head(sdr_result$projected_data)
#>            V1          V2         V3
#> 1  0.49558138  0.09203527 0.15575243
#> 2  0.22281560  1.53635693 1.15002405
#> 3  0.52522648 -0.25455816 0.07992585
#> 4 -0.12508370  3.09342626 1.02262469
#> 5  0.25594361 -0.02213188 0.17249223
#> 6 -0.09054116  1.13584722 0.86440192

Visualizing the 3D Projected Data

#Plot for 3-dimensional reduction using rSDR

plot_rSDR(projected_data=sdr_result$projected_data,Y=Y.factor,Y.name='group',colors=c("#374E55FF", "#DF8F44FF"))
Plot for 3-dimensional reduction using rSDR
Plot for 3-dimensional reduction using rSDR

Cross-Validation for Optimal Alpha (d=3)

# plan(multisession) will launch parallel workers running in the background
# to save running time. To shut down background workers launched this way, call
# plan(sequential)
# use all local cores except one
# future::plan(future::multisession, workers = future::availableCores() - 1)
# use 2 cores for parallel
future::plan("multisession", workers = 2)
set.seed(2435)
opt_results<-optimal_alpha_cv(alpha.v=c(0.3, 0.5, 0.7),X=X,Y=Y,d=3,kfolds=10)
#> This will take a few seconds or minutes to get the results.
#opt_results<-optimal_alpha_cv(alpha.v=c(0.3, 0.4, 0.5, 0.6, 0.7),X=X,Y=Y,d=3,kfolds=10)

# Optimal alpha using cross validation
opt_results$opt.alpha
#> [1] 0.7

# The cost function values by alpha (column) for each cross validation (row) 
head(opt_results$f_test)
#>              0.3         0.5         0.7
#> [1,] -0.04919316 -0.05200294 -0.04633166
#> [2,] -0.06920450 -0.08929782 -0.09433253
#> [3,] -0.04472843 -0.05253492 -0.05533650
#> [4,] -0.05847677 -0.04592416 -0.05414926
#> [5,] -0.08752592 -0.09585069 -0.09452867
#> [6,] -0.02952418 -0.02512230 -0.02467935

# The mean of cost function value by alpha 
opt_results$f_test.mean
#>         0.3         0.5         0.7 
#> -0.06205520 -0.06120896 -0.06648143

# The standard deviation of cost function value by alpha 
opt_results$f_test.sd
#>        0.3        0.5        0.7 
#> 0.01926043 0.02119046 0.02535447

# The reduced dimension
opt_results$d
#> [1] 3

# Number of folds
opt_results$kfolds
#> [1] 10
plot_alpha(opt_results=opt_results)
Mean and standard deviation of cost function (k-fold cross-validation method)
Mean and standard deviation of cost function (k-fold cross-validation method)

Bootstrap for Optimal Alpha (d=3)

# plan(multisession) will launch parallel workers running in the background
# to save running time. To shut down background workers launched this way, call
# plan(sequential)
# use all local cores except one
# future::plan(future::multisession, workers = future::availableCores() - 1)
# use 2 cores for parallel
future::plan("multisession", workers = 2)
set.seed(2435)
opt_results<-optimal_alpha_boot(alpha.v=c(0.3,0.5, 0.7),X=X,Y=Y,d=3,R=10)
#> This will take a few seconds or minutes to get the results.
#opt_results<-optimal_alpha_boot(alpha.v=c(0.3,0.4,0.5,0.6, 0.7),X=X,Y=Y,d=3,R=50)

# Optimal alpha using bootstrap method
opt_results$opt.alpha
#> [1] 0.7

# The cost function values by alpha (column) for R bootstrap replicates (row) 
head(opt_results$f_test)
#>              0.3         0.5         0.7
#> [1,] -0.04123228 -0.06510945 -0.06349116
#> [2,] -0.05346989 -0.05615554 -0.05670479
#> [3,] -0.06034808 -0.06623436 -0.07687898
#> [4,] -0.06069561 -0.06189878 -0.05231286
#> [5,] -0.06213373 -0.04805119 -0.05082178
#> [6,] -0.05106833 -0.04766037 -0.05188302

# The mean of cost function value by alpha 
opt_results$f_test.mean
#>         0.3         0.5         0.7 
#> -0.05348157 -0.05805082 -0.05836145

# The standard deviation of cost function value by alpha 
opt_results$f_test.sd
#>         0.3         0.5         0.7 
#> 0.007649731 0.007248535 0.008392497

# The reduced dimension
opt_results$d
#> [1] 3

# Number of bootstrap replicates
opt_results$R
#> [1] 10
plot_alpha(opt_results=opt_results)
Mean and standard deviation of cost function (bootstrap method)
Mean and standard deviation of cost function (bootstrap method)

Example: 2-Dimensional Reduction and Plotting

set.seed(2435)
sdr_result<-rSDR(X=X, Y=Y, d=2, alpha=0.3,maxiter=1000,tol=1e-7)

# An optimal of C is obtained by maximizing the target function using ManifoldOptim method
head(sdr_result$C_value)
#>              [,1]        [,2]
#> [1,] -0.195840175  0.01920542
#> [2,] -0.008489214  0.36357606
#> [3,] -0.089659622 -0.06152668
#> [4,]  0.025908724  0.35114685
#> [5,]  0.206643972  0.12064514
#> [6,] -0.317245433  0.13294555

#The value of cost function f is equal to the negative of the target function.
sdr_result$f_value
#> [1] -0.04539063

#beta=sigma_x^(-0.5)%*%C 
head(sdr_result$beta)
#>            [,1]        [,2]
#> [1,] -0.4503527 -0.46366907
#> [2,]  0.4871003  0.93057571
#> [3,] -0.1964792  0.06231069
#> [4,] -0.1416008  0.54261663
#> [5,]  0.8963119  0.19068888
#> [6,] -1.0070804 -0.46193622

#projected_data: This projects X onto the estimated SDR directions (X %*% beta), the projected matrix (data) n x d
head(sdr_result$projected_data)
#>           V1         V2
#> 1 -0.3147479  0.3907114
#> 2  0.5904977  0.8874504
#> 3 -0.2551743  0.3706773
#> 4 -0.1722560 -1.2104044
#> 5 -0.4147929  0.3816966
#> 6 -0.2916534 -0.4002997
#Plot for 2-dimensional reduction using rSDR
plot_rSDR(projected_data=sdr_result$projected_data,Y=Y.factor,Y.name='group',colors=c("#374E55FF", "#DF8F44FF"))

Conclusion

This document tested the core functionality of the rSDR package, including:

The package successfully processes the ionosphere dataset, optimizes the alpha parameter, and produces meaningful visualizations of the reduced dimensions.

Reference

Hsin-Hsiung Huang, Feng Yu & Teng Zhang (19 Feb 2024): Robust sufficient dimension reduction via α-distance covariance, Journal of Nonparametric Statistics, DOI:10.1080/10485252.2024.2313137