Using Your Own Data

Olatunde D. Akanbi

2025-12-17

library(manureshed)

Overview

The manureshed package includes data from 1987-2016, but you can add your own:

Using Custom WWTP Data

For Years Outside 2007-2016

The package has WWTP data for 2007-2016. For other years, provide your own:

# Use your WWTP files for 2020
results_2020 <- run_builtin_analysis(
  scale = "huc8",
  year = 2020,  # Agricultural data available 1987-2016
  nutrients = "nitrogen",
  include_wwtp = TRUE,
  custom_wwtp_nitrogen = "nitrogen_2021.csv",
  wwtp_load_units = "kg"
)

# 3. Check results
summarize_results(results_custom)
quick_check(results_custom)

# 4. Create maps
nitrogen_map <- map_agricultural_classification(
  results_custom$integrated$nitrogen,
  "nitrogen", "combined_N_class",
  "2021 Nitrogen with Custom WWTP Data"
)

save_plot(nitrogen_map, "custom_analysis_2021.png")

# 5. Save results
save_spatial_data(
  results_custom$integrated$nitrogen,
  "nitrogen_2021_results.rds"
)

This example shows the complete workflow from custom data to final maps and saved results.

Advanced Custom Data Integration

Adding Other Nutrient Sources

You can integrate additional nutrient sources beyond WWTP:

# Example: Adding industrial sources
industrial_data <- data.frame(
  Facility_Name = c("Steel Plant A", "Chemical Plant B", "Food Processor C"),
  Lat = c(41.5, 40.7, 42.1),
  Long = c(-81.7, -82.1, -83.5),
  N_Load_tons = c(50, 75, 30),
  P_Load_tons = c(10, 15, 8),
  Source_Type = "Industrial"
)

# Convert to spatial format
industrial_sf <- st_as_sf(industrial_data, 
                         coords = c("Long", "Lat"), 
                         crs = 4326) %>%
  st_transform(5070)  # Transform to analysis CRS

# Aggregate by spatial units (example for HUC8)
boundaries <- load_builtin_boundaries("huc8")
industrial_aggregated <- wwtp_aggregate_by_boundaries(
  industrial_sf, boundaries, "nitrogen", "huc8"
)

# Add to existing results
# (You would modify the integration functions to include industrial sources)

Working with Different Time Periods

# Create a time series dataset
years_to_analyze <- 2018:2022
time_series_results <- list()

for (year in years_to_analyze) {
  # Check if you have WWTP data for this year
  wwtp_file <- paste0("wwtp_nitrogen_", year, ".csv")
  
  if (file.exists(wwtp_file)) {
    time_series_results[[as.character(year)]] <- run_builtin_analysis(
      scale = "huc8",
      year = year,
      nutrients = "nitrogen",
      include_wwtp = TRUE,
      custom_wwtp_nitrogen = wwtp_file,
      save_outputs = FALSE,
      verbose = FALSE
    )
  } else {
    # Agricultural only if no WWTP data
    time_series_results[[as.character(year)]] <- run_builtin_analysis(
      scale = "huc8", 
      year = year,
      nutrients = "nitrogen",
      include_wwtp = FALSE,
      save_outputs = FALSE,
      verbose = FALSE
    )
  }
}

# Compare results across years
yearly_summary <- do.call(rbind, lapply(names(time_series_results), function(year) {
  result <- time_series_results[[year]]
  classes <- table(result$agricultural$N_class)
  
  data.frame(
    Year = as.numeric(year),
    Total_Units = sum(classes),
    Source_Units = classes["Source"] %||% 0,
    Sink_Units = sum(classes[c("Sink_Deficit", "Sink_Fertilizer")], na.rm = TRUE),
    Excluded_Units = classes["Excluded"] %||% 0
  )
}))

print(yearly_summary)

Custom Agricultural Data

If you have agricultural data from other sources:

# Example: Custom agricultural data format
custom_farm_data <- data.frame(
  County_FIPS = c("39001", "39003", "39005"),
  Manure_N_kg = c(100000, 150000, 200000),
  Manure_P_kg = c(20000, 30000, 40000),
  Fertilizer_N_kg = c(50000, 75000, 100000),
  Fertilizer_P_kg = c(10000, 15000, 20000),
  N_Fixation_kg = c(25000, 40000, 35000),
  Crop_N_Removal_kg = c(80000, 120000, 140000),
  Crop_P_Removal_kg = c(15000, 25000, 30000),
  Cropland_acres = c(45000, 67000, 52000)
)

# Convert to package format
standardized_farm_data <- data.frame(
  ID = custom_farm_data$County_FIPS,
  NAME = paste("County", substr(custom_farm_data$County_FIPS, 3, 5)),
  manure_N = custom_farm_data$Manure_N_kg,
  manure_P = custom_farm_data$Manure_P_kg,
  fertilizer_N = custom_farm_data$Fertilizer_N_kg,
  fertilizer_P = custom_farm_data$Fertilizer_P_kg,
  N_fixation = custom_farm_data$N_Fixation_kg,
  crop_removal_N = custom_farm_data$Crop_N_Removal_kg,
  crop_removal_P = custom_farm_data$Crop_P_Removal_kg,
  cropland = custom_farm_data$Cropland_acres
)

# Apply classifications
custom_classified <- standardized_farm_data %>%
  agri_classify_nitrogen(cropland_threshold = 1234, scale = "county") %>%
  agri_classify_phosphorus(cropland_threshold = 1234, scale = "county")

print(custom_classified)

Data Validation and Quality Control

# Function to validate your custom data
validate_custom_data <- function(data, data_type = "wwtp") {
  
  issues <- list()
  
  if (data_type == "wwtp") {
    # Check required columns
    required_cols <- c("Facility_Name", "Lat", "Long")
    missing_cols <- setdiff(required_cols, names(data))
    if (length(missing_cols) > 0) {
      issues$missing_columns <- missing_cols
    }
    
    # Check coordinate ranges (for US)
    if ("Lat" %in% names(data) && "Long" %in% names(data)) {
      invalid_coords <- sum(data$Lat < 24 | data$Lat > 50 | 
                           data$Long < -125 | data$Long > -66, na.rm = TRUE)
      if (invalid_coords > 0) {
        issues$invalid_coordinates <- invalid_coords
      }
    }
    
    # Check for negative loads
    load_cols <- names(data)[grepl("Load|load", names(data))]
    for (col in load_cols) {
      if (col %in% names(data)) {
        negative_loads <- sum(data[[col]] < 0, na.rm = TRUE)
        if (negative_loads > 0) {
          issues[[paste0("negative_", col)]] <- negative_loads
        }
      }
    }
  }
  
  if (data_type == "agricultural") {
    # Check for negative values in nutrient data
    nutrient_cols <- c("manure_N", "manure_P", "fertilizer_N", "fertilizer_P",
                       "crop_removal_N", "crop_removal_P", "cropland")
    
    for (col in nutrient_cols) {
      if (col %in% names(data)) {
        negative_values <- sum(data[[col]] < 0, na.rm = TRUE)
        if (negative_values > 0) {
          issues[[paste0("negative_", col)]] <- negative_values
        }
      }
    }
  }
  
  if (length(issues) == 0) {
    message("✓ Data validation passed")
  } else {
    message("âš  Data validation issues found:")
    for (issue in names(issues)) {
      message("  ", issue, ": ", issues[[issue]])
    }
  }
  
  return(issues)
}

# Use the validation function
# validate_custom_data(your_wwtp_data, "wwtp")
# validate_custom_data(your_farm_data, "agricultural")

Exporting Results

# Export results in different formats
export_analysis_results <- function(results, output_dir = "exports") {
  
  dir.create(output_dir, showWarnings = FALSE)
  
  # Export agricultural data as CSV
  agri_csv <- st_drop_geometry(results$agricultural)
  write.csv(agri_csv, file.path(output_dir, "agricultural_results.csv"), row.names = FALSE)
  
  # Export as shapefile (if sf package available)
  if (requireNamespace("sf", quietly = TRUE)) {
    st_write(results$agricultural, file.path(output_dir, "agricultural_results.shp"), 
             delete_dsn = TRUE, quiet = TRUE)
  }
  
  # Export WWTP facilities if available
  if ("wwtp" %in% names(results)) {
    for (nutrient in names(results$wwtp)) {
      facility_data <- results$wwtp[[nutrient]]$facility_data
      write.csv(facility_data, 
                file.path(output_dir, paste0("wwtp_", nutrient, "_facilities.csv")), 
                row.names = FALSE)
    }
  }
  
  # Export integrated results if available
  if ("integrated" %in% names(results)) {
    for (nutrient in names(results$integrated)) {
      integrated_csv <- st_drop_geometry(results$integrated[[nutrient]])
      write.csv(integrated_csv, 
                file.path(output_dir, paste0("integrated_", nutrient, "_results.csv")), 
                row.names = FALSE)
    }
  }
  
  message("Results exported to: ", output_dir)
}

# Use the export function
# export_analysis_results(results_custom, "my_exports")

Troubleshooting Common Issues

WWTP Data Issues

# Common WWTP data problems and solutions

# Problem 1: "No valid facilities remaining after cleaning"
# Solution: Check coordinates and load values
wwtp_data <- read.csv("your_wwtp_file.csv")
summary(wwtp_data)  # Look for obvious issues

# Check coordinate ranges
summary(wwtp_data$Latitude)   # Should be ~24-50 for US
summary(wwtp_data$Longitude)  # Should be ~-125 to -66 for US

# Check load values
summary(wwtp_data$Load)  # Should be positive numbers

# Problem 2: Wrong coordinate system
# Solution: Make sure coordinates are in decimal degrees (not UTM)
# Example conversion if needed:
# library(sf)
# points_utm <- st_as_sf(data, coords = c("X_UTM", "Y_UTM"), crs = 32617)
# points_dd <- st_transform(points_utm, 4326)
# coords_dd <- st_coordinates(points_dd)

# Problem 3: Mixed units in same file
# Solution: Standardize units before analysis
standardize_mixed_units <- function(data, load_col, unit_col) {
  data$standardized_load <- ifelse(
    data[[unit_col]] == "kg", data[[load_col]],
    ifelse(data[[unit_col]] == "lbs", data[[load_col]] / 2.20462,
           data[[load_col]] * 907.185)  # assuming tons
  )
  return(data)
}

Agricultural Data Issues

# Common agricultural data problems

# Problem: Impossible nutrient balances
# Solution: Check your units and conversion factors
check_nutrient_balance <- function(data) {
  # Calculate simple checks
  data$total_inputs_N <- data$manure_N + data$fertilizer_N + data$N_fixation
  data$efficiency_N <- data$crop_removal_N / data$total_inputs_N
  
  # Flag suspicious values
  suspicious <- data$efficiency_N > 2.0 | data$efficiency_N < 0.1
  
  if (sum(suspicious, na.rm = TRUE) > 0) {
    message("Warning: ", sum(suspicious, na.rm = TRUE), " units have suspicious N efficiency")
    print(data[suspicious, c("ID", "total_inputs_N", "crop_removal_N", "efficiency_N")])
  }
  
  return(data)
}

# Problem: Missing spatial boundaries
# Solution: Use built-in boundaries or provide your own
# county_boundaries <- load_builtin_boundaries("county")
# your_data_with_boundaries <- merge(your_data, county_boundaries, by.x = "FIPS", by.y = "FIPS")

Getting Help

Common Questions

Q: What format should my WWTP data be in? A: CSV file with columns for facility name, latitude, longitude, annual load, and state. The package can auto-detect most EPA formats.

Q: What units can I use for loads?
A: “kg”, “lbs”, “pounds”, or “tons”. Specify with wwtp_load_units.

Q: Can I use data from other countries? A: The spatial boundaries are US-only, but you could provide your own boundaries and modify the coordinate system.

Q: How do I handle missing data? A: The package excludes facilities with missing coordinates or loads. Clean your data first for best results.

Q: My analysis is running slowly. What can I do? A: Try using a smaller spatial scale (HUC2 < HUC8 < County in terms of processing time), fewer years, or clear the data cache with clear_data_cache().

Q: How do I cite the package and data? A: Use citation_info() to get proper citations for the package and underlying datasets.

Function Help

# Get help on specific functions
?load_user_wwtp
?run_builtin_analysis  
?wwtp_clean_data

# Check what data is available
check_builtin_data()
list_available_years()

# Package health check
health_check()

# Test data download connection
test_osf_connection()

Getting More Help

The manureshed package is designed to work with your data as easily as possible. Start with the built-in examples, then gradually add your own data as needed.nitrogen_2020.csv” )

Handling Different Data Formats

Standard EPA Format

Most EPA WWTP files work automatically:

# Standard EPA format (auto-detected)
results <- run_builtin_analysis(
  scale = "county",
  year = 2018,
  nutrients = c("nitrogen", "phosphorus"),
  include_wwtp = TRUE,
  custom_wwtp_nitrogen = "nitrogen_2018.csv",
  custom_wwtp_phosphorus = "phosphorus_2018.csv"
)

Different Units

# If your data uses pounds instead of kg
results <- run_builtin_analysis(
  scale = "huc8", 
  year = 2019,
  nutrients = "nitrogen",
  include_wwtp = TRUE,
  custom_wwtp_nitrogen = "nitrogen_pounds_2019.csv",
  wwtp_load_units = "lbs"  # Converts automatically
)

# Other units: "kg", "lbs", "pounds", "tons"

Different File Format

# If headers are in different rows
results <- run_builtin_analysis(
  scale = "county",
  year = 2021, 
  nutrients = "nitrogen",
  include_wwtp = TRUE,
  custom_wwtp_nitrogen = "nitrogen_2021.csv",
  wwtp_skip_rows = 3,      # Skip first 3 rows
  wwtp_header_row = 1      # Headers in row 1 after skipping
)

Custom Column Names

# If your columns have different names
custom_mapping <- list(
  facility_name = "Plant_Name",
  latitude = "Lat_DD",
  longitude = "Long_DD", 
  pollutant_load = "Annual_Load_kg",
  state = "State_Code"
)

results <- run_builtin_analysis(
  scale = "huc8",
  year = 2022,
  nutrients = "nitrogen", 
  include_wwtp = TRUE,
  custom_wwtp_nitrogen = "custom_format.csv",
  wwtp_column_mapping = custom_mapping
)

Manual WWTP Processing

For full control, process WWTP data step by step:

# Step 1: Load your WWTP file
wwtp_raw <- load_user_wwtp(
  file_path = "nitrogen_2020.csv",
  nutrient = "nitrogen",
  load_units = "lbs"
)

# Step 2: Clean the data
wwtp_clean <- wwtp_clean_data(wwtp_raw, "nitrogen")

# Step 3: Classify facilities by size
wwtp_classified <- wwtp_classify_sources(wwtp_clean, "nitrogen")

# Step 4: Convert to spatial format
wwtp_spatial <- wwtp_to_spatial(wwtp_classified)

# Step 5: Load boundaries and aggregate by spatial units
boundaries <- load_builtin_boundaries("huc8")
wwtp_aggregated <- wwtp_aggregate_by_boundaries(
  wwtp_spatial, boundaries, "nitrogen", "huc8"
)

# Step 6: Integrate with agricultural data
agri_data <- load_builtin_nugis("huc8", 2020)
agri_processed <- agri_classify_complete(agri_data, "huc8")

integrated <- integrate_wwtp_agricultural(
  agri_processed, wwtp_aggregated, "nitrogen", 
  cropland_threshold = 1234, scale = "huc8"
)

Unit Conversions

Common Conversions

# Convert between units
kg_loads <- c(1000, 2000, 5000)
tons_loads <- convert_load_units(kg_loads, "kg")

lbs_loads <- c(2000, 4000, 10000)  
tons_loads <- convert_load_units(lbs_loads, "lbs")

print(data.frame(
  Original_kg = kg_loads,
  Converted_tons = convert_load_units(kg_loads, "kg")
))

Handling P2O5 vs P

The package automatically converts P2O5 to P, but you can do it manually:

# If you have P2O5 data, convert to P
p2o5_values <- c(100, 200, 300)  # kg P2O5
p_values <- p2o5_values * P2O5_TO_P  # Convert to P

print(paste("P2O5 to P conversion factor:", P2O5_TO_P))

Working with Different Spatial Scales

County Data (FIPS Codes)

# County analysis - make sure you have 5-digit FIPS codes
county_results <- run_builtin_analysis(
  scale = "county",
  year = 2016,
  nutrients = "nitrogen"
)

# Your county WWTP data should have state and county info

HUC8 Watersheds

# HUC8 analysis - 8-digit watershed codes
huc8_results <- run_builtin_analysis(
  scale = "huc8", 
  year = 2016,
  nutrients = "nitrogen"
)

# Make sure HUC8 codes are 8 digits (add leading zero if needed)
huc8_codes <- c("4110001", "4110002")  # 7 digits
formatted_codes <- format_huc8(huc8_codes)  # Adds leading zero
print(formatted_codes)  # "04110001", "04110002"

HUC2 Regions

# HUC2 analysis - 2-digit regional codes
huc2_results <- run_builtin_analysis(
  scale = "huc2",
  year = 2016, 
  nutrients = "nitrogen"
)

State-Specific Analysis

Single State

# Analyze just one state
iowa_results <- run_state_analysis(
  state = "IA",
  scale = "county",
  year = 2016,
  nutrients = "nitrogen",
  include_wwtp = TRUE
)

# With custom WWTP data
texas_results <- run_state_analysis(
  state = "TX",
  scale = "huc8",
  year = 2020,
  nutrients = "nitrogen", 
  include_wwtp = TRUE,
  custom_wwtp_nitrogen = "texas_wwtp_2020.csv"
)

Multiple States

# Analyze several states
midwest_states <- c("IA", "IL", "IN", "OH") 
state_results <- list()

for (state in midwest_states) {
  state_results[[state]] <- run_state_analysis(
    state = state,
    scale = "county",
    year = 2016,
    nutrients = "nitrogen", 
    include_wwtp = TRUE
  )
}

Quality Control

Validate Your Data

# Check your results make sense
quick_check(results)

# Look at the classifications
table(results$agricultural$N_class)

# Check WWTP facility counts
if ("wwtp" %in% names(results)) {
  print(paste("WWTP facilities:", nrow(results$wwtp$nitrogen$facility_data)))
}

Common Data Issues

# Problem: Negative nutrient values
# Solution: Check your data source and units

# Problem: All facilities excluded  
# Solution: Check coordinate system (should be decimal degrees)

# Problem: No WWTP facilities found
# Solution: Check facility coordinates are in correct format

# Problem: Classification doesn't make sense
# Solution: Check cropland threshold and nutrient balance calculations

Working with Multiple Years

Time Series Analysis

# Analyze multiple years
years_to_analyze <- 2014:2016

batch_results <- batch_analysis_years(
  years = years_to_analyze,
  scale = "huc8",
  nutrients = "nitrogen", 
  include_wwtp = TRUE
)

# With custom WWTP data for some years
custom_wwtp_files <- list(
  "2018" = "nitrogen_2018.csv",
  "2019" = "nitrogen_2019.csv", 
  "2020" = "nitrogen_2020.csv"
)

# Process each year with custom data
custom_results <- list()
for (year in names(custom_wwtp_files)) {
  custom_results[[year]] <- run_builtin_analysis(
    scale = "county",
    year = as.numeric(year),
    nutrients = "nitrogen",
    include_wwtp = TRUE,
    custom_wwtp_nitrogen = custom_wwtp_files[[year]]
  )
}

Data Management Tips

File Organization

# Organize your data files
# project_folder/
#   ├── wwtp_data/
#   │   ├── nitrogen_2018.csv
#   │   ├── nitrogen_2019.csv
#   │   └── phosphorus_2018.csv
#   ├── results/
#   └── maps/

# Set working directory
setwd("project_folder")

# Run analysis with organized files
results <- run_builtin_analysis(
  scale = "huc8",
  year = 2018,
  nutrients = "nitrogen",
  include_wwtp = TRUE,
  custom_wwtp_nitrogen = "wwtp_data/nitrogen_2018.csv",
  output_dir = "results"
)

Memory Management

# For large datasets, clear cache periodically
clear_data_cache()

# Check package health
health_check()

# Free up memory between analyses
rm(large_results)
gc()

Example: Complete Custom Workflow

Here’s a complete example using custom data:

# 1. Prepare your WWTP file (nitrogen_2021.csv)
# Make sure it has columns: Facility Name, Latitude, Longitude, Load (kg/yr), State

# 2. Run analysis
results_custom <- run_builtin_analysis(
  scale = "huc8",
  year = 2021,  # Agricultural data extrapolated to 2021
  nutrients = "nitrogen",
  include_wwtp = TRUE,
  custom_wwtp_nitrogen = "nitrogen_2021.csv",
  wwtp_load_units = "kg"
)

# 3. Check results
summarize_results(results_custom)
quick_check(results_custom)

# 4. Create maps
nitrogen_map <- map_agricultural_classification(
  results_custom$integrated$nitrogen,
  "nitrogen", "combined_N_class",
  "2021 Nitrogen with Custom WWTP Data"
)

save_plot(nitrogen_map, "custom_analysis_2021.png")

# 5. Save results
save_spatial_data(
  results_custom$integrated$nitrogen,
  "nitrogen_2021_results.rds"
)

Integration Issues

# Common integration problems

# Problem: WWTP facilities not matching spatial units
# Solution: Check coordinate systems and boundary files
check_wwtp_spatial_match <- function(wwtp_data, boundaries) {
  
  # Convert WWTP to spatial
  wwtp_sf <- st_as_sf(wwtp_data, coords = c("Long", "Lat"), crs = 4326)
  wwtp_projected <- st_transform(wwtp_sf, st_crs(boundaries))
  
  # Check spatial intersection
  intersections <- st_intersects(wwtp_projected, boundaries)
  
  # Count facilities per spatial unit
  matches <- lengths(intersections)
  
  message("WWTP spatial matching summary:")
  message("  Total facilities: ", nrow(wwtp_data))
  message("  Facilities matched to boundaries: ", sum(matches > 0))
  message("  Facilities not matched: ", sum(matches == 0))
  
  if (sum(matches == 0) > 0) {
    unmatched <- wwtp_data[matches == 0, ]
    message("  Check coordinates for unmatched facilities")
    print(head(unmatched))
  }
  
  return(matches)
}

# Problem: Scale mismatch between data sources
# Solution: Ensure consistent spatial scales
verify_scale_consistency <- function(agricultural_data, wwtp_data, scale) {
  
  message("Scale consistency check for: ", scale)
  
  # Check ID formats
  if (scale == "county") {
    # FIPS codes should be 5 digits
    invalid_fips <- sum(nchar(agricultural_data$ID) != 5)
    message("  Invalid FIPS codes: ", invalid_fips)
  } else if (scale == "huc8") {
    # HUC8 codes should be 8 digits
    invalid_huc8 <- sum(nchar(agricultural_data$ID) != 8)
    message("  Invalid HUC8 codes: ", invalid_huc8)
  }
  
  # Check coordinate ranges
  coord_summary <- list(
    lat_range = range(wwtp_data$Lat, na.rm = TRUE),
    long_range = range(wwtp_data$Long, na.rm = TRUE)
  )
  
  message("  WWTP coordinate ranges:")
  message("    Latitude: ", paste(round(coord_summary$lat_range, 2), collapse = " to "))
  message("    Longitude: ", paste(round(coord_summary$long_range, 2), collapse = " to "))
  
  return(coord_summary)
}

Best Practices for Custom Data

File Organization

# Recommended project structure
create_project_structure <- function(project_name) {
  
  # Create main directories
  dir.create(project_name, showWarnings = FALSE)
  dir.create(file.path(project_name, "data"), showWarnings = FALSE)
  dir.create(file.path(project_name, "data", "wwtp"), showWarnings = FALSE)
  dir.create(file.path(project_name, "data", "agricultural"), showWarnings = FALSE)
  dir.create(file.path(project_name, "results"), showWarnings = FALSE)
  dir.create(file.path(project_name, "maps"), showWarnings = FALSE)
  dir.create(file.path(project_name, "exports"), showWarnings = FALSE)
  
  # Create README
  readme_content <- paste(
    "# Manureshed Analysis Project",
    "",
    "## Data Files",
    "- data/wwtp/ - WWTP facility data files",
    "- data/agricultural/ - Custom agricultural data",
    "",
    "## Outputs", 
    "- results/ - Analysis results (.rds files)",
    "- maps/ - Generated maps (.png files)",
    "- exports/ - Exported data (.csv, .shp files)",
    "",
    "## Analysis Scripts",
    "- analysis.R - Main analysis script",
    "",
    sep = "\n"
  )
  
  writeLines(readme_content, file.path(project_name, "README.md"))
  
  message("Project structure created: ", project_name)
  message("Add your data files to the appropriate folders")
}

# Create organized project
# create_project_structure("my_nutrient_analysis")

Data Documentation

# Document your custom data sources
document_data_sources <- function(wwtp_files = NULL, agricultural_files = NULL, 
                                 output_file = "data_documentation.txt") {
  
  doc_content <- c(
    "DATA SOURCES DOCUMENTATION",
    "============================",
    "",
    "Analysis Date: ", as.character(Sys.Date()),
    "Package Version: ", as.character(packageVersion("manureshed")),
    ""
  )
  
  if (!is.null(wwtp_files)) {
    doc_content <- c(doc_content,
      "WWTP DATA FILES:",
      "----------------"
    )
    
    for (file in wwtp_files) {
      if (file.exists(file)) {
        file_info <- file.info(file)
        doc_content <- c(doc_content,
          paste("File:", file),
          paste("  Size:", round(file_info$size / 1024, 2), "KB"),
          paste("  Modified:", file_info$mtime),
          paste("  Rows:", nrow(read.csv(file))),
          ""
        )
      }
    }
  }
  
  if (!is.null(agricultural_files)) {
    doc_content <- c(doc_content,
      "AGRICULTURAL DATA FILES:",
      "------------------------"
    )
    
    for (file in agricultural_files) {
      if (file.exists(file)) {
        file_info <- file.info(file)
        doc_content <- c(doc_content,
          paste("File:", file),
          paste("  Size:", round(file_info$size / 1024, 2), "KB"),
          paste("  Modified:", file_info$mtime),
          paste("  Rows:", nrow(read.csv(file))),
          ""
        )
      }
    }
  }
  
  doc_content <- c(doc_content,
    "ANALYSIS PARAMETERS:",
    "--------------------",
    "Built-in data years: 1987-2016 (Agricultural), 2007-2016 (WWTP)",
    "Spatial scales: county, huc8, huc2",
    "Default cropland threshold: 500 ha (1,234 acres)",
    ""
  )
  
  writeLines(doc_content, output_file)
  message("Data documentation saved to: ", output_file)
}

# Use documentation function
# document_data_sources(
#   wwtp_files = c("data/wwtp/nitrogen_2020.csv", "data/wwtp/phosphorus_2020.csv"),
#   agricultural_files = c("data/agricultural/custom_farm_data.csv")
# )

Quality Assurance Workflow

# Complete quality assurance workflow
quality_assurance_workflow <- function(results, data_sources = NULL) {
  
  qa_report <- list()
  qa_report$timestamp <- Sys.time()
  qa_report$data_sources <- data_sources
  
  # 1. Basic data checks
  qa_report$basic_checks <- list(
    total_spatial_units = nrow(results$agricultural),
    missing_classifications = sum(is.na(results$agricultural$N_class))
  )
  
  # 2. Classification distribution
  if ("N_class" %in% names(results$agricultural)) {
    n_dist <- table(results$agricultural$N_class)
    qa_report$classification_distribution <- list(
      nitrogen = n_dist,
      excluded_percentage = round(n_dist["Excluded"] / sum(n_dist) * 100, 1)
    )
  }
  
  # 3. WWTP integration checks
  if ("wwtp" %in% names(results)) {
    qa_report$wwtp_checks <- list()
    
    for (nutrient in names(results$wwtp)) {
      facilities <- results$wwtp[[nutrient]]$facility_data
      qa_report$wwtp_checks[[nutrient]] <- list(
        total_facilities = nrow(facilities),
        facilities_with_loads = sum(!is.na(facilities[[paste0(toupper(substr(nutrient, 1, 1)), "_Load_tons")]]))
      )
    }
  }
  
  # 4. Spatial validation
  if (inherits(results$agricultural, "sf")) {
    qa_report$spatial_checks <- list(
      invalid_geometries = sum(!st_is_valid(results$agricultural)),
      crs = st_crs(results$agricultural)$input
    )
  }
  
  # 5. Range checks
  if ("integrated" %in% names(results)) {
    for (nutrient in names(results$integrated)) {
      data <- results$integrated[[nutrient]]
      surplus_col <- paste0(substr(nutrient, 1, 1), "_surplus")
      
      if (surplus_col %in% names(data)) {
        qa_report$range_checks[[nutrient]] <- list(
          min_surplus = min(data[[surplus_col]], na.rm = TRUE),
          max_surplus = max(data[[surplus_col]], na.rm = TRUE),
          extreme_values = sum(abs(data[[surplus_col]]) > 1000, na.rm = TRUE)
        )
      }
    }
  }
  
  # Generate QA summary
  qa_summary <- list(
    passed = TRUE,
    warnings = character(),
    errors = character()
  )
  
  # Check for issues
  if (qa_report$basic_checks$missing_classifications > 0) {
    qa_summary$warnings <- c(qa_summary$warnings, 
                             paste("Missing classifications:", qa_report$basic_checks$missing_classifications))
  }
  
  if (qa_report$classification_distribution$excluded_percentage > 50) {
    qa_summary$warnings <- c(qa_summary$warnings,
                             paste("High exclusion rate:", qa_report$classification_distribution$excluded_percentage, "%"))
  }
  
  if ("spatial_checks" %in% names(qa_report) && qa_report$spatial_checks$invalid_geometries > 0) {
    qa_summary$errors <- c(qa_summary$errors,
                           paste("Invalid geometries:", qa_report$spatial_checks$invalid_geometries))
    qa_summary$passed <- FALSE
  }
  
  # Print summary
  message("Quality Assurance Summary:")
  message("  Status: ", ifelse(qa_summary$passed, "PASSED", "FAILED"))
  
  if (length(qa_summary$warnings) > 0) {
    message("  Warnings:")
    for (warning in qa_summary$warnings) {
      message("    - ", warning)
    }
  }
  
  if (length(qa_summary$errors) > 0) {
    message("  Errors:")
    for (error in qa_summary$errors) {
      message("    - ", error)
    }
  }
  
  qa_report$summary <- qa_summary
  return(qa_report)
}

# Run QA workflow
# qa_results <- quality_assurance_workflow(results_custom, "Custom 2021 WWTP data")

Getting Help

Common Questions

Q: What format should my WWTP data be in? A: CSV file with columns for facility name, latitude, longitude, annual load, and state. The package can auto-detect most EPA formats.

Q: What units can I use for loads?
A: “kg”, “lbs”, “pounds”, or “tons”. Specify with wwtp_load_units.

Q: Can I use data from other countries? A: The spatial boundaries are US-only, but you could provide your own boundaries and modify the coordinate system.

Q: How do I handle missing data? A: The package excludes facilities with missing coordinates or loads. Clean your data first for best results.

Q: My analysis is running slowly. What can I do? A: Try using a smaller spatial scale (HUC2 < HUC8 < County in terms of processing time), fewer years, or clear the data cache with clear_data_cache().

Q: How do I cite the package and data? A: Use citation_info() to get proper citations for the package and underlying datasets.

Q: Can I analyze partial years or seasonal data? A: The package is designed for annual data. For seasonal analysis, you would need to aggregate your data to annual totals first.

Q: What if my WWTP data has different pollutant measurements? A: The package expects total nitrogen and total phosphorus loads. If you have other forms (NO3, NH4, PO4), you’ll need to convert to total N and P first.

Function Help

# Get help on specific functions
?load_user_wwtp
?run_builtin_analysis  
?wwtp_clean_data
?validate_columns

# Check what data is available
check_builtin_data()
list_available_years()

# Package health check
health_check()

# Test data download connection
test_osf_connection()

Getting More Help

Reporting Issues

If you encounter bugs or have feature requests:

  1. Check that your data format matches the expected format
  2. Run health_check() to verify package installation
  3. Try with built-in data first to isolate the issue
  4. Document the error message and steps to reproduce
  5. Check the package documentation and vignettes
  6. Report issues with a minimal reproducible example

The manureshed package is designed to work with your data as easily as possible. Start with the built-in examples, then gradually add your own data as needed. The key is to ensure your data is properly formatted and validated before running the analysis.