Advanced Leakage Detection with leakr

Advanced Target Leakage Scenarios

Target leakage can be subtle and context-dependent. Let’s explore some realistic scenarios:

Medical Diagnosis Example

# Create a medical dataset with subtle leakage
set.seed(456)
n <- 500

medical_data <- data.frame(
  patient_id = 1:n,
  age = sample(25:75, n, replace = TRUE),
  bmi = rnorm(n, 25, 4),
  blood_pressure = rnorm(n, 120, 15),
  diagnosis = factor(sample(c("healthy", "diseased"), n, replace = TRUE, prob = c(0.8, 0.2)))
)

# Add a leaky feature: treatment_received (only available post-diagnosis)
medical_data$treatment_received <- ifelse(
  medical_data$diagnosis == "diseased", 
  sample(c("yes", "no"), sum(medical_data$diagnosis == "diseased"), replace = TRUE, prob = c(0.9, 0.1)),
  "no"
)

# Audit the medical data
medical_report <- leakr_audit(
  data = medical_data,
  target = "diagnosis",
  id = "patient_id"
)

print(medical_report)
#> $summary
#> data frame with 0 columns and 0 rows
#> 
#> $evidence
#> list()
#> 
#> $meta
#> $meta$n_detectors
#> [1] 0
#> 
#> $meta$n_issues
#> [1] 0
#> 
#> $meta$data_shape
#> [1] 500   6
#> 
#> $meta$original_data_shape
#> [1] 500   6
#> 
#> $meta$was_sampled
#> [1] FALSE
#> 
#> $meta$detectors_run
#> NULL
#> 
#> $meta$timestamp
#> [1] "2025-10-22 10:43:35 CEST"
#> 
#> $meta$config_used
#> $meta$config_used$sample_size
#> [1] 50000
#> 
#> $meta$config_used$correlation_threshold
#> [1] 0.8
#> 
#> $meta$config_used$contamination_threshold
#> [1] 0.1
#> 
#> $meta$config_used$numeric_severity
#> [1] TRUE
#> 
#> $meta$config_used$plot_results
#> [1] FALSE
#> 
#> $meta$config_used$parallel
#> [1] FALSE
#> 
#> $meta$config_used$seed
#> [1] 123
#> 
#> 
#> 
#> attr(,"class")
#> [1] "leakr_report"

Financial Data with Temporal Issues

# Create financial data with temporal leakage
set.seed(789)
dates <- seq(as.Date("2020-01-01"), as.Date("2023-12-31"), by = "month")

financial_data <- data.frame(
  account_id = 1:200,
  transaction_date = sample(dates, 200, replace = TRUE),
  amount = rlnorm(200, 4, 1),
  account_balance = rnorm(200, 1000, 500),
  default_risk = factor(sample(c("low", "high"), 200, replace = TRUE))
)

# Sort by date
financial_data <- financial_data[order(financial_data$transaction_date), ]

# Add feature that uses future information (credit score after default assessment)
financial_data$credit_score_updated <- ifelse(
  financial_data$default_risk == "high",
  rnorm(sum(financial_data$default_risk == "high"), 450, 50),
  rnorm(sum(financial_data$default_risk == "low"), 750, 75)
)

# Create temporal split
financial_data$split <- ifelse(
  financial_data$transaction_date < as.Date("2022-01-01"), 
  "train", 
  "test"
)

# Audit financial data
financial_report <- leakr_audit(
  data = financial_data,
  target = "default_risk",
  split = "split",
  id = "account_id"
)

print(financial_report)
#> $summary
#> data frame with 0 columns and 0 rows
#> 
#> $evidence
#> list()
#> 
#> $meta
#> $meta$n_detectors
#> [1] 0
#> 
#> $meta$n_issues
#> [1] 0
#> 
#> $meta$data_shape
#> [1] 200   7
#> 
#> $meta$original_data_shape
#> [1] 200   7
#> 
#> $meta$was_sampled
#> [1] FALSE
#> 
#> $meta$detectors_run
#> NULL
#> 
#> $meta$timestamp
#> [1] "2025-10-22 10:43:35 CEST"
#> 
#> $meta$config_used
#> $meta$config_used$sample_size
#> [1] 50000
#> 
#> $meta$config_used$correlation_threshold
#> [1] 0.8
#> 
#> $meta$config_used$contamination_threshold
#> [1] 0.1
#> 
#> $meta$config_used$numeric_severity
#> [1] TRUE
#> 
#> $meta$config_used$plot_results
#> [1] FALSE
#> 
#> $meta$config_used$parallel
#> [1] FALSE
#> 
#> $meta$config_used$seed
#> [1] 123
#> 
#> 
#> 
#> attr(,"class")
#> [1] "leakr_report"

Advanced Duplication Detection

Near-Duplicate Detection in Customer Data

# Create customer dataset with near-duplicates
set.seed(321)

# Original customers
customers <- data.frame(
  name = c("John Smith", "Jane Doe", "Bob Johnson", "Alice Brown", "Charlie Davis"),
  email = c("john@email.com", "jane@email.com", "bob@email.com", "alice@email.com", "charlie@email.com"),
  age = c(35, 28, 42, 31, 39),
  income = c(50000, 45000, 75000, 55000, 62000),
  purchase_category = factor(c("electronics", "books", "clothing", "electronics", "books"))
)

# Create near-duplicates with slight variations
near_dupes <- customers[1:3, ]
near_dupes$name <- c("J Smith", "Jane D", "Robert Johnson")  # Name variations
near_dupes$email <- c("john.smith@email.com", "j.doe@email.com", "bob.johnson@email.com")  # Email variations
near_dupes$age <- near_dupes$age + c(1, 0, -1)  # Age variations

# Combine datasets
all_customers <- rbind(customers, near_dupes)
all_customers$customer_id <- 1:nrow(all_customers)

# Audit for duplicates
dup_report <- leakr_audit(
  data = all_customers,
  target = "purchase_category",
  id = "customer_id"
)

print(dup_report)
#> $summary
#> data frame with 0 columns and 0 rows
#> 
#> $evidence
#> list()
#> 
#> $meta
#> $meta$n_detectors
#> [1] 0
#> 
#> $meta$n_issues
#> [1] 0
#> 
#> $meta$data_shape
#> [1] 8 6
#> 
#> $meta$original_data_shape
#> [1] 8 6
#> 
#> $meta$was_sampled
#> [1] FALSE
#> 
#> $meta$detectors_run
#> NULL
#> 
#> $meta$timestamp
#> [1] "2025-10-22 10:43:35 CEST"
#> 
#> $meta$config_used
#> $meta$config_used$sample_size
#> [1] 50000
#> 
#> $meta$config_used$correlation_threshold
#> [1] 0.8
#> 
#> $meta$config_used$contamination_threshold
#> [1] 0.1
#> 
#> $meta$config_used$numeric_severity
#> [1] TRUE
#> 
#> $meta$config_used$plot_results
#> [1] FALSE
#> 
#> $meta$config_used$parallel
#> [1] FALSE
#> 
#> $meta$config_used$seed
#> [1] 123
#> 
#> 
#> 
#> attr(,"class")
#> [1] "leakr_report"

Configuration and Customisation

Custom Configuration Options

# Example of custom configuration for sensitive detection
sensitive_config <- list(
  sample_size = 5000,           # Limit sample size for large datasets
  correlation_threshold = 0.7,   # Lower threshold for correlation-based detection
  duplicate_threshold = 0.9      # Threshold for considering records as duplicates
)

# Apply custom configuration
iris_sensitive <- leakr_audit(
  data = iris,
  target = "Species",
  config = sensitive_config
)

print(iris_sensitive)
#> $summary
#> data frame with 0 columns and 0 rows
#> 
#> $evidence
#> list()
#> 
#> $meta
#> $meta$n_detectors
#> [1] 0
#> 
#> $meta$n_issues
#> [1] 0
#> 
#> $meta$data_shape
#> [1] 150   5
#> 
#> $meta$original_data_shape
#> [1] 150   5
#> 
#> $meta$was_sampled
#> [1] FALSE
#> 
#> $meta$detectors_run
#> NULL
#> 
#> $meta$timestamp
#> [1] "2025-10-22 10:43:36 CEST"
#> 
#> $meta$config_used
#> $meta$config_used$sample_size
#> [1] 5000
#> 
#> $meta$config_used$correlation_threshold
#> [1] 0.7
#> 
#> $meta$config_used$contamination_threshold
#> [1] 0.1
#> 
#> $meta$config_used$numeric_severity
#> [1] TRUE
#> 
#> $meta$config_used$plot_results
#> [1] FALSE
#> 
#> $meta$config_used$parallel
#> [1] FALSE
#> 
#> $meta$config_used$seed
#> [1] 123
#> 
#> $meta$config_used$duplicate_threshold
#> [1] 0.9
#> 
#> 
#> 
#> attr(,"class")
#> [1] "leakr_report"

Working with Large Datasets

Stratified Sampling for Balanced Analysis

# Create a large imbalanced dataset
set.seed(555)
large_n <- 10000

large_data <- data.frame(
  feature1 = rnorm(large_n),
  feature2 = sample(letters[1:10], large_n, replace = TRUE),
  feature3 = rnorm(large_n, 100, 20),
  # Imbalanced target
  target = factor(sample(c("rare", "common"), large_n, replace = TRUE, prob = c(0.05, 0.95)))
)

# Use stratified sampling to ensure representation
sample_indices <- stratified_sample(large_data$target, 1000)
sampled_data <- large_data[sample_indices, ]

# Verify sampling maintained class balance
table(large_data$target)
#> 
#> common   rare 
#>   9504    496
table(sampled_data$target)
#> 
#> common   rare 
#>    950     50

# Audit sampled data
large_report <- leakr_audit(
  data = sampled_data,
  target = "target"
)

print(large_report)
#> $summary
#> data frame with 0 columns and 0 rows
#> 
#> $evidence
#> list()
#> 
#> $meta
#> $meta$n_detectors
#> [1] 0
#> 
#> $meta$n_issues
#> [1] 0
#> 
#> $meta$data_shape
#> [1] 1000    4
#> 
#> $meta$original_data_shape
#> [1] 1000    4
#> 
#> $meta$was_sampled
#> [1] FALSE
#> 
#> $meta$detectors_run
#> NULL
#> 
#> $meta$timestamp
#> [1] "2025-10-22 10:43:36 CEST"
#> 
#> $meta$config_used
#> $meta$config_used$sample_size
#> [1] 50000
#> 
#> $meta$config_used$correlation_threshold
#> [1] 0.8
#> 
#> $meta$config_used$contamination_threshold
#> [1] 0.1
#> 
#> $meta$config_used$numeric_severity
#> [1] TRUE
#> 
#> $meta$config_used$plot_results
#> [1] FALSE
#> 
#> $meta$config_used$parallel
#> [1] FALSE
#> 
#> $meta$config_used$seed
#> [1] 123
#> 
#> 
#> 
#> attr(,"class")
#> [1] "leakr_report"

Advanced Reporting and Analysis

Detailed Report Analysis

# Create complex dataset for comprehensive analysis
complex_data <- data.frame(
  id = 1:300,
  timestamp = seq(as.POSIXct("2023-01-01"), as.POSIXct("2023-12-31"), length.out = 300),
  feature_a = rnorm(300),
  feature_b = sample(LETTERS[1:5], 300, replace = TRUE),
  feature_c = rnorm(300, 50, 10),
  outcome = factor(sample(c("success", "failure"), 300, replace = TRUE))
)

# Add intentional leakage for demonstration
complex_data$leaky_feature <- ifelse(complex_data$outcome == "success", 1, 0)

# Generate comprehensive audit
detailed_report <- leakr_audit(
  data = complex_data,
  target = "outcome",
  id = "id"
)

# Generate detailed summary
detailed_summary <- leakr_summarise(detailed_report, top_n = 10, show_config = TRUE)
#> Leakage Audit Report
#> ===================
#> Data shape: 300 x 7 
#> Detectors run:  
#> Timestamp: 2025-10-22 10:43:36 
#> 
#> ✓ No leakage issues detected.
print(detailed_summary)
#> data frame with 0 columns and 0 rows

Best Practices for Advanced Usage

1. Multi-Stage Validation

Implement a systematic approach to leakage detection:

# Multi-stage validation function
comprehensive_validation <- function(data, target, id = NULL, split = NULL) {
  
  cat("Stage 1: Basic data validation\n")
  # Basic preprocessing and validation
  clean_data <- validate_and_preprocess_data(data, target, split, id)
  
  cat("Stage 2: Initial leakage screening\n")
  # Quick initial screening
  initial_report <- leakr_audit(clean_data, target = target, split = split, id = id)
  
  cat("Stage 3: Detailed analysis\n")
  # Generate detailed summary
  summary_report <- leakr_summarise(initial_report, top_n = 15, show_config = TRUE)
  
  # Count critical issues
  if(length(initial_report$issues) > 0) {
    critical_count <- sum(sapply(initial_report$issues, function(x) 
      !is.null(x$severity) && x$severity == "high"))
    
    if(critical_count > 0) {
      cat("WARNING:", critical_count, "critical issues detected!\n")
    }
  }
  
  return(list(
    data = clean_data,
    audit = initial_report,
    summary = summary_report
  ))
}

# Example usage
# validation_result <- comprehensive_validation(your_data, "target_column")

2. Domain-Specific Validation

Adapt validation to your specific domain:

# Example: E-commerce specific validation
ecommerce_validation <- function(data, target) {
  
  # Standard audit
  base_report <- leakr_audit(data, target = target)
  
  # Domain-specific checks
  issues <- list()
  
  # Check for post-purchase features
  post_purchase_patterns <- c("return", "refund", "satisfaction", "rating")
  feature_names <- names(data)
  
  for(pattern in post_purchase_patterns) {
    matching_features <- grep(pattern, feature_names, value = TRUE, ignore.case = TRUE)
    if(length(matching_features) > 0) {
      issues <- append(issues, paste("Potential post-purchase feature:", 
                                   paste(matching_features, collapse = ", ")))
    }
  }
  
  if(length(issues) > 0) {
    cat("Domain-specific warnings:\n")
    for(issue in issues) {
      cat("-", issue, "\n")
    }
  }
  
  return(base_report)
}

# Example e-commerce data
ecommerce_data <- data.frame(
  customer_id = 1:100,
  purchase_amount = rlnorm(100, 4, 1),
  product_category = sample(c("electronics", "books", "clothing"), 100, replace = TRUE),
  customer_satisfaction = sample(1:5, 100, replace = TRUE),  # Post-purchase!
  will_repurchase = factor(sample(c("yes", "no"), 100, replace = TRUE))
)

# Validate e-commerce data
ecommerce_report <- ecommerce_validation(ecommerce_data, "will_repurchase")
#> Domain-specific warnings:
#> - Potential post-purchase feature: customer_satisfaction

Summary

This vignette has demonstrated advanced leakage detection techniques including:

Complex leakage scenarios in medical and financial domains
Near-duplicate detection for customer data
Configuration customisation for different sensitivity requirements
Large dataset handling with stratified sampling
Multi-stage validation workflows
Domain-specific validation approaches

The key to effective leakage detection is understanding your data domain and systematically applying appropriate detection techniques. leakr provides the flexibility to adapt these techniques to your specific requirements whilst maintaining robust detection capabilities.

For integration with popular ML frameworks, see the “Framework Integration” vignette.