Advanced Leakage Detection with leakr

Cheryl Isabella Lim

2025-10-22

library(leakr)
#> 
#> Attaching package: 'leakr'
#> The following object is masked from 'package:base':
#> 
#>     %||%

Introduction

This vignette explores advanced usage patterns for the leakr package, demonstrating how to detect subtle leakage patterns and customise the detection process for specific scenarios. We’ll cover complex datasets, configuration options, and best practices for comprehensive leakage detection.

Understanding leakr’s Detection Capabilities

The leakr package can identify various types of data leakage that might compromise model validity:

# View available detectors: list_registered_detectors() returns their names
# as a character vector, which is stored and then printed
available_detectors <- list_registered_detectors()
print(available_detectors)
#> [1] "file_format"                "and_convert_dates_enhanced"

Advanced Target Leakage Scenarios

Target leakage can be subtle and context-dependent. Let’s explore some realistic scenarios:

Medical Diagnosis Example

# Create a medical dataset with subtle leakage
set.seed(456)
n <- 500

medical_data <- data.frame(
  patient_id = seq_len(n),
  age = sample(25:75, n, replace = TRUE),
  bmi = rnorm(n, 25, 4),
  blood_pressure = rnorm(n, 120, 15),
  diagnosis = factor(
    sample(c("healthy", "diseased"), n, replace = TRUE, prob = c(0.8, 0.2))
  )
)

# Add a leaky feature: treatment_received (only available post-diagnosis).
# Exactly one value is drawn per diseased patient and assigned by logical
# index. Handing a vector of length sum(is_diseased) to ifelse() would recycle
# it to n rows, reusing some draws and discarding others.
is_diseased <- medical_data$diagnosis == "diseased"
medical_data$treatment_received <- "no"
medical_data$treatment_received[is_diseased] <- sample(
  c("yes", "no"), sum(is_diseased), replace = TRUE, prob = c(0.9, 0.1)
)

# Audit the medical data: diagnosis is the target, patient_id identifies rows
medical_report <- leakr_audit(
  medical_data, target = "diagnosis", id = "patient_id"
)

# Show the resulting leakr_report
print(medical_report)
#> $summary
#> data frame with 0 columns and 0 rows
#> 
#> $evidence
#> list()
#> 
#> $meta
#> $meta$n_detectors
#> [1] 0
#> 
#> $meta$n_issues
#> [1] 0
#> 
#> $meta$data_shape
#> [1] 500   6
#> 
#> $meta$original_data_shape
#> [1] 500   6
#> 
#> $meta$was_sampled
#> [1] FALSE
#> 
#> $meta$detectors_run
#> NULL
#> 
#> $meta$timestamp
#> [1] "2025-10-22 10:43:35 CEST"
#> 
#> $meta$config_used
#> $meta$config_used$sample_size
#> [1] 50000
#> 
#> $meta$config_used$correlation_threshold
#> [1] 0.8
#> 
#> $meta$config_used$contamination_threshold
#> [1] 0.1
#> 
#> $meta$config_used$numeric_severity
#> [1] TRUE
#> 
#> $meta$config_used$plot_results
#> [1] FALSE
#> 
#> $meta$config_used$parallel
#> [1] FALSE
#> 
#> $meta$config_used$seed
#> [1] 123
#> 
#> 
#> 
#> attr(,"class")
#> [1] "leakr_report"

Financial Data with Temporal Issues

# Create financial data with temporal leakage
set.seed(789)
n_accounts <- 200
dates <- seq(as.Date("2020-01-01"), as.Date("2023-12-31"), by = "month")

financial_data <- data.frame(
  account_id = seq_len(n_accounts),
  transaction_date = sample(dates, n_accounts, replace = TRUE),
  amount = rlnorm(n_accounts, 4, 1),
  account_balance = rnorm(n_accounts, 1000, 500),
  default_risk = factor(sample(c("low", "high"), n_accounts, replace = TRUE))
)

# Sort by date
financial_data <- financial_data[order(financial_data$transaction_date), ]

# Add feature that uses future information (credit score after default
# assessment). Each risk group gets exactly one draw per row via logical
# indexing. Passing group-sized vectors to ifelse() would recycle them across
# all rows and hand the same score to several accounts.
is_high_risk <- financial_data$default_risk == "high"
financial_data$credit_score_updated <- NA_real_
financial_data$credit_score_updated[is_high_risk] <-
  rnorm(sum(is_high_risk), 450, 50)
financial_data$credit_score_updated[!is_high_risk] <-
  rnorm(sum(!is_high_risk), 750, 75)

# Create temporal split: rows dated before 2022 train, the rest test
financial_data$split <- ifelse(
  financial_data$transaction_date < as.Date("2022-01-01"),
  "train",
  "test"
)

# Audit financial data, passing the target, split and identifier column names
financial_report <- leakr_audit(
  financial_data, target = "default_risk", split = "split", id = "account_id"
)

# Show the resulting leakr_report
print(financial_report)
#> $summary
#> data frame with 0 columns and 0 rows
#> 
#> $evidence
#> list()
#> 
#> $meta
#> $meta$n_detectors
#> [1] 0
#> 
#> $meta$n_issues
#> [1] 0
#> 
#> $meta$data_shape
#> [1] 200   7
#> 
#> $meta$original_data_shape
#> [1] 200   7
#> 
#> $meta$was_sampled
#> [1] FALSE
#> 
#> $meta$detectors_run
#> NULL
#> 
#> $meta$timestamp
#> [1] "2025-10-22 10:43:35 CEST"
#> 
#> $meta$config_used
#> $meta$config_used$sample_size
#> [1] 50000
#> 
#> $meta$config_used$correlation_threshold
#> [1] 0.8
#> 
#> $meta$config_used$contamination_threshold
#> [1] 0.1
#> 
#> $meta$config_used$numeric_severity
#> [1] TRUE
#> 
#> $meta$config_used$plot_results
#> [1] FALSE
#> 
#> $meta$config_used$parallel
#> [1] FALSE
#> 
#> $meta$config_used$seed
#> [1] 123
#> 
#> 
#> 
#> attr(,"class")
#> [1] "leakr_report"

Advanced Duplication Detection

Near-Duplicate Detection in Customer Data

# Build a customer table and then append near-duplicate copies of some rows
set.seed(321)

# Five original customers
customers <- data.frame(
  name = c(
    "John Smith", "Jane Doe", "Bob Johnson", "Alice Brown", "Charlie Davis"
  ),
  email = c(
    "john@email.com", "jane@email.com", "bob@email.com",
    "alice@email.com", "charlie@email.com"
  ),
  age = c(35, 28, 42, 31, 39),
  income = c(50000, 45000, 75000, 55000, 62000),
  purchase_category = factor(
    c("electronics", "books", "clothing", "electronics", "books")
  )
)

# Copy the first three customers and perturb name, email and age slightly;
# income and purchase_category stay identical to the originals
near_dupes <- customers[seq_len(3), ]
near_dupes[["name"]] <- c("J Smith", "Jane D", "Robert Johnson")
near_dupes[["email"]] <- c(
  "john.smith@email.com", "j.doe@email.com", "bob.johnson@email.com"
)
near_dupes[["age"]] <- near_dupes[["age"]] + c(1, 0, -1)

# Stack originals and near-duplicates, then number the rows as customer_id
all_customers <- rbind(customers, near_dupes)
all_customers[["customer_id"]] <- seq_len(nrow(all_customers))

# Audit for duplicates, with purchase_category as target and customer_id as id
dup_report <- leakr_audit(
  all_customers, target = "purchase_category", id = "customer_id"
)

# Show the resulting leakr_report
print(dup_report)
#> $summary
#> data frame with 0 columns and 0 rows
#> 
#> $evidence
#> list()
#> 
#> $meta
#> $meta$n_detectors
#> [1] 0
#> 
#> $meta$n_issues
#> [1] 0
#> 
#> $meta$data_shape
#> [1] 8 6
#> 
#> $meta$original_data_shape
#> [1] 8 6
#> 
#> $meta$was_sampled
#> [1] FALSE
#> 
#> $meta$detectors_run
#> NULL
#> 
#> $meta$timestamp
#> [1] "2025-10-22 10:43:35 CEST"
#> 
#> $meta$config_used
#> $meta$config_used$sample_size
#> [1] 50000
#> 
#> $meta$config_used$correlation_threshold
#> [1] 0.8
#> 
#> $meta$config_used$contamination_threshold
#> [1] 0.1
#> 
#> $meta$config_used$numeric_severity
#> [1] TRUE
#> 
#> $meta$config_used$plot_results
#> [1] FALSE
#> 
#> $meta$config_used$parallel
#> [1] FALSE
#> 
#> $meta$config_used$seed
#> [1] 123
#> 
#> 
#> 
#> attr(,"class")
#> [1] "leakr_report"

Configuration and Customisation

Custom Configuration Options

# Example of custom configuration for sensitive detection
sensitive_config <- list(
  # Limit sample size for large datasets
  sample_size = 5000,
  # Lower threshold for correlation-based detection
  correlation_threshold = 0.7,
  # Threshold for considering records as duplicates.
  # NOTE(review): this key is not among the default entries printed under
  # meta$config_used in the reports; confirm that a detector reads it.
  duplicate_threshold = 0.9
)

# Apply custom configuration when auditing iris with Species as the target
iris_sensitive <- leakr_audit(
  iris, target = "Species", config = sensitive_config
)

print(iris_sensitive)
#> $summary
#> data frame with 0 columns and 0 rows
#> 
#> $evidence
#> list()
#> 
#> $meta
#> $meta$n_detectors
#> [1] 0
#> 
#> $meta$n_issues
#> [1] 0
#> 
#> $meta$data_shape
#> [1] 150   5
#> 
#> $meta$original_data_shape
#> [1] 150   5
#> 
#> $meta$was_sampled
#> [1] FALSE
#> 
#> $meta$detectors_run
#> NULL
#> 
#> $meta$timestamp
#> [1] "2025-10-22 10:43:36 CEST"
#> 
#> $meta$config_used
#> $meta$config_used$sample_size
#> [1] 5000
#> 
#> $meta$config_used$correlation_threshold
#> [1] 0.7
#> 
#> $meta$config_used$contamination_threshold
#> [1] 0.1
#> 
#> $meta$config_used$numeric_severity
#> [1] TRUE
#> 
#> $meta$config_used$plot_results
#> [1] FALSE
#> 
#> $meta$config_used$parallel
#> [1] FALSE
#> 
#> $meta$config_used$seed
#> [1] 123
#> 
#> $meta$config_used$duplicate_threshold
#> [1] 0.9
#> 
#> 
#> 
#> attr(,"class")
#> [1] "leakr_report"

Working with Large Datasets

Stratified Sampling for Balanced Analysis

# Simulate a large dataset whose target classes are heavily imbalanced
set.seed(555)
large_n <- 10000

large_data <- data.frame(
  feature1 = rnorm(large_n),
  feature2 = sample(letters[1:10], large_n, replace = TRUE),
  feature3 = rnorm(large_n, 100, 20),
  # Imbalanced target: "rare" is drawn with probability 0.05
  target = factor(
    sample(c("rare", "common"), large_n, replace = TRUE, prob = c(0.05, 0.95))
  )
)

# stratified_sample() is given the target column and the value 1000; its
# result is used as row indices to subset the full data
sample_indices <- stratified_sample(large_data$target, 1000)
sampled_data <- large_data[sample_indices, ]

# Compare class counts before and after sampling to check that the class
# ratio was kept
table(large_data$target)
#> 
#> common   rare 
#>   9504    496
table(sampled_data$target)
#> 
#> common   rare 
#>    950     50

# Audit the sampled rows only
large_report <- leakr_audit(sampled_data, target = "target")

print(large_report)
#> $summary
#> data frame with 0 columns and 0 rows
#> 
#> $evidence
#> list()
#> 
#> $meta
#> $meta$n_detectors
#> [1] 0
#> 
#> $meta$n_issues
#> [1] 0
#> 
#> $meta$data_shape
#> [1] 1000    4
#> 
#> $meta$original_data_shape
#> [1] 1000    4
#> 
#> $meta$was_sampled
#> [1] FALSE
#> 
#> $meta$detectors_run
#> NULL
#> 
#> $meta$timestamp
#> [1] "2025-10-22 10:43:36 CEST"
#> 
#> $meta$config_used
#> $meta$config_used$sample_size
#> [1] 50000
#> 
#> $meta$config_used$correlation_threshold
#> [1] 0.8
#> 
#> $meta$config_used$contamination_threshold
#> [1] 0.1
#> 
#> $meta$config_used$numeric_severity
#> [1] TRUE
#> 
#> $meta$config_used$plot_results
#> [1] FALSE
#> 
#> $meta$config_used$parallel
#> [1] FALSE
#> 
#> $meta$config_used$seed
#> [1] 123
#> 
#> 
#> 
#> attr(,"class")
#> [1] "leakr_report"

Advanced Reporting and Analysis

Detailed Report Analysis

# Create complex dataset for comprehensive analysis.
# The RNG is seeded first so the simulated columns are reproducible, matching
# the other simulated datasets in this vignette.
set.seed(2023)
n_obs <- 300

complex_data <- data.frame(
  id = seq_len(n_obs),
  # Explicit time zone so the timestamps do not depend on the local setting
  timestamp = seq(
    as.POSIXct("2023-01-01", tz = "UTC"),
    as.POSIXct("2023-12-31", tz = "UTC"),
    length.out = n_obs
  ),
  feature_a = rnorm(n_obs),
  feature_b = sample(LETTERS[1:5], n_obs, replace = TRUE),
  feature_c = rnorm(n_obs, 50, 10),
  outcome = factor(sample(c("success", "failure"), n_obs, replace = TRUE))
)

# Add intentional leakage for demonstration: a 1/0 encoding of the outcome
complex_data$leaky_feature <- ifelse(complex_data$outcome == "success", 1, 0)

# Audit the data with outcome as the target and id as the row identifier
detailed_report <- leakr_audit(complex_data, target = "outcome", id = "id")

# Summarise the report, passing top_n = 10 and show_config = TRUE; the call
# prints the text below and its return value is kept for inspection
detailed_summary <- leakr_summarise(
  detailed_report,
  top_n = 10,
  show_config = TRUE
)
#> Leakage Audit Report
#> ===================
#> Data shape: 300 x 7 
#> Detectors run:  
#> Timestamp: 2025-10-22 10:43:36 
#> 
#> ✓ No leakage issues detected.
print(detailed_summary)
#> data frame with 0 columns and 0 rows

Best Practices for Advanced Usage

1. Multi-Stage Validation

Implement a systematic approach to leakage detection:

# Multi-stage validation function.
#
# Announces each stage with cat() and runs, in order:
# validate_and_preprocess_data(), leakr_audit() and leakr_summarise().
# Afterwards it prints a WARNING line if any high-severity issues were found.
#
# Arguments:
#   data   - data to validate; handed to validate_and_preprocess_data().
#   target - name of the target column.
#   id     - optional identifier column name (default NULL).
#   split  - optional split column name (default NULL).
#
# Returns a list with elements `data` (the preprocessed data), `audit` (the
# leakr_audit() result) and `summary` (the leakr_summarise() result).
comprehensive_validation <- function(data, target, id = NULL, split = NULL) {

  cat("Stage 1: Basic data validation\n")
  # Basic preprocessing and validation
  clean_data <- validate_and_preprocess_data(data, target, split, id)

  cat("Stage 2: Initial leakage screening\n")
  # Quick initial screening
  initial_report <- leakr_audit(
    clean_data, target = target, split = split, id = id
  )

  cat("Stage 3: Detailed analysis\n")
  # Generate detailed summary
  summary_report <- leakr_summarise(
    initial_report, top_n = 15, show_config = TRUE
  )

  # Count critical issues. The report printed by leakr_audit() exposes
  # `$summary`, `$evidence` and `$meta` but no `$issues` element, so the issue
  # rows are read from `$summary`.
  # NOTE(review): assumes `$summary` carries a `severity` column that uses the
  # label "high" for critical findings; confirm against the leakr version used.
  issue_table <- initial_report$summary
  critical_count <- 0L
  if (is.data.frame(issue_table) && "severity" %in% names(issue_table)) {
    critical_count <- sum(as.character(issue_table$severity) %in% "high")
  } else if (length(initial_report$issues) > 0) {
    # Fallback for report objects that keep issues as a list of records
    critical_count <- sum(vapply(
      initial_report$issues,
      function(x) isTRUE(x$severity == "high"),
      logical(1)
    ))
  }

  if (critical_count > 0) {
    cat("WARNING:", critical_count, "critical issues detected!\n")
  }

  list(
    data = clean_data,
    audit = initial_report,
    summary = summary_report
  )
}

# Example usage
# validation_result <- comprehensive_validation(your_data, "target_column")

2. Domain-Specific Validation

Adapt validation to your specific domain:

# Example: E-commerce specific validation.
#
# Runs leakr_audit() on `data` for the given `target`, then scans the column
# names for substrings that suggest post-purchase information and prints one
# warning line per matching pattern. Returns the leakr_audit() result.
ecommerce_validation <- function(data, target) {

  # Standard audit comes first; its result is what the function returns
  base_report <- leakr_audit(data, target = target)

  # Case-insensitive substrings that hint at post-purchase features
  keywords <- c("return", "refund", "satisfaction", "rating")
  columns <- names(data)

  # Columns matched by each keyword, keeping only keywords with a match
  hits <- lapply(keywords, function(kw) {
    grep(kw, columns, value = TRUE, ignore.case = TRUE)
  })
  hits <- Filter(function(h) length(h) > 0, hits)

  notes <- vapply(
    hits,
    function(h) {
      paste("Potential post-purchase feature:", paste(h, collapse = ", "))
    },
    character(1)
  )

  if (length(notes) > 0) {
    cat("Domain-specific warnings:\n")
    for (note in notes) {
      cat("-", note, "\n")
    }
  }

  base_report
}

# Example e-commerce data, seeded so the simulated values are reproducible
set.seed(2468)
n_customers <- 100

ecommerce_data <- data.frame(
  customer_id = seq_len(n_customers),
  purchase_amount = rlnorm(n_customers, 4, 1),
  product_category = sample(
    c("electronics", "books", "clothing"), n_customers, replace = TRUE
  ),
  # Post-purchase! Only known after the purchase has happened
  customer_satisfaction = sample(1:5, n_customers, replace = TRUE),
  will_repurchase = factor(sample(c("yes", "no"), n_customers, replace = TRUE))
)

# Validate e-commerce data with will_repurchase as the target; the printed
# warning below is triggered by the customer_satisfaction column name
ecommerce_report <- ecommerce_validation(ecommerce_data, "will_repurchase")
#> Domain-specific warnings:
#> - Potential post-purchase feature: customer_satisfaction

Summary

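This vignette has demonstrated advanced leakage detection techniques including:

- Auditing realistic target-leakage scenarios (medical diagnosis data and financial data with a temporal split)
- Checking customer records that contain near-duplicates
- Customising the audit through configuration options
- Using stratified sampling to audit large, imbalanced datasets
- Summarising audit reports in detail
- Building multi-stage and domain-specific validation workflows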

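The key to effective leakage detection is understanding your data domain and systematically applying appropriate detection techniques. leakr provides the flexibility to adapt these techniques to your specific requirements whilst maintaining robust detection capabilities.

Note that in the rendered examples above no detectors were run (`n_detectors` is 0 and `detectors_run` is `NULL`), so every report is empty even where leakage was added deliberately. Check `list_registered_detectors()` and the `detectors_run` entry of a report in your own session before treating an empty report as evidence that no leakage is present.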

For integration with popular ML frameworks, see the “Framework Integration” vignette.