#' Title GenePopStats: Population Genetics Statistics for Selective Sweep
#'
#'
#' @param vcf_file Enter the path to VCF File
#' @param window_size Length of the genomic region
#' @param step_size Length of slide in the window in forward direction
#'
#' @return Results are being obtained as a matrix containing 5 Population Summary Statistics such as "Pi", "Wattersons_theta", "Tajima_D", "Kelly_ZnS" and "Omega" Statistics subdivided into windows as specified chromosomal region
#' @export
#'
#' @examples
#'
#' path<-system.file("exdata", "ExampleVCF.vcf", package = "GenePopStats")
#' GenePopStats(path, 20, 10)
#'
#'
#'
#' @importFrom vcfR read.vcfR
#' @importFrom vcfR extract.gt
#' @importFrom vcfR getPOS
#' @importFrom stats cor
#'
#'
#'
#'
#'
#'
#'
#' @export
#'
GenePopStats <- function(vcf_file, window_size, step_size) {
    calc_pi <- function(genomat) {
    n <- ncol(genomat)
    if (n < 2) return(NA)

    diffs <- 0
    comparisons <- 0

    for (i in 1:(n - 1)) {
      for (j in (i + 1):n) {
        valid <- !is.na(genomat[, i]) & !is.na(genomat[, j])
        if (any(valid)) {
          diffs <- diffs + sum(genomat[valid, i] != genomat[valid, j])
          comparisons <- comparisons + sum(valid)
        }
      }
    }
    if (comparisons == 0) return(NA)
    return(diffs / comparisons)
  }

  # Watterson's theta
  calc_thetaW <- function(genomat) {
    n <- ncol(genomat)
    if (n < 2) return(NA)
    S <- sum(apply(genomat, 1, function(x) {
      alleles <- unique(x[!is.na(x)])
      length(alleles) > 1
    }))
    if (S == 0) return(0)
    a1 <- sum(1 / (1:(n - 1)))
    return(S / a1)
  }

  # Tajima's D
  calc_tajimaD <- function(genomat) {
    n <- ncol(genomat)
    if (n < 2) return(NA)

    S <- sum(apply(genomat, 1, function(x) {
      alleles <- unique(x[!is.na(x)])
      length(alleles) > 1
    }))
    if (S == 0) return(NA)

    pi_val <- calc_pi(genomat)
    a1 <- sum(1 / (1:(n - 1)))
    a2 <- sum(1 / ((1:(n - 1))^2))
    b1 <- (n + 1) / (3 * (n - 1))
    b2 <- (2 * (n^2 + n + 3)) / (9 * n * (n - 1))
    c1 <- b1 - (1 / a1)
    c2 <- b2 - ((n + 2) / (a1 * n)) + (a2 / (a1^2))
    e1 <- c1 / a1
    e2 <- c2 / (a1^2 + a2)

    thetaW <- S / a1
    var_d <- e1 * S + e2 * S * (S - 1)
    if (var_d <= 0) return(NA)

    D <- (pi_val - thetaW) / sqrt(var_d)
    return(D)
  }

  # Kelly's ZnS
  calc_ZnS <- function(genomat) {
    if (nrow(genomat) < 2) return(NA)

    seg_sites <- genomat[apply(genomat, 1, function(x) {
      alleles <- unique(x[!is.na(x)])
      length(alleles) > 1
    }), , drop = FALSE]
    if (nrow(seg_sites) < 2) return(NA)

    r2_vals <- c()
    for (i in 1:(nrow(seg_sites) - 1)) {
      for (j in (i + 1):nrow(seg_sites)) {
        x <- seg_sites[i, ]
        y <- seg_sites[j, ]
        valid <- !is.na(x) & !is.na(y)
        if (sum(valid) > 2) {
          r <- suppressWarnings(cor(x[valid], y[valid]))
          if (!is.na(r)) r2_vals <- c(r2_vals, r^2)
        }
      }
    }
    if (length(r2_vals) == 0) return(NA)
    mean(r2_vals)
  }

  # Omega statistic
  calc_omega <- function(genomat) {
    if (nrow(genomat) < 3) return(NA)

    seg_sites <- genomat[apply(genomat, 1, function(x) {
      alleles <- unique(x[!is.na(x)])
      length(alleles) > 1
    }), , drop = FALSE]
    m <- nrow(seg_sites)
    if (m < 3) return(NA)


    best <- -Inf
    for (k in 2:(m - 1)) {
      left <- seg_sites[1:k, , drop = FALSE]
      right <- seg_sites[(k + 1):m, , drop = FALSE]
      ZnS_left <- calc_ZnS(left)
      ZnS_right <- calc_ZnS(right)
      if (!is.na(ZnS_left) && !is.na(ZnS_right)) {
        diff_val <- abs(ZnS_left - ZnS_right)
        if (diff_val > best) best <- diff_val
      }
    }
    if (best == -Inf) return(NA)
    return(best)
  }
  vcf <- vcfR::read.vcfR(vcf_file, verbose = FALSE)
  gt <- vcfR::extract.gt(vcf, element = "GT", as.numeric = FALSE)
  pos <- as.numeric(vcfR::getPOS(vcf))

  gt_mat <- apply(gt, 2, function(g) {
    ifelse(g == "0/0", 0,
           ifelse(g %in% c("0/1", "1/0"), 1,
                  ifelse(g == "1/1", 2, NA)))
  })
  if (is.null(dim(gt_mat))) stop("No usable genotype data found in VCF.")

  df <- data.frame()
  max_pos <- max(pos, na.rm = TRUE)
  windows <- seq(1, max_pos, by = step_size)

  for (w_start in windows) {
    w_end <- w_start + window_size - 1
    idx <- which(pos >= w_start & pos <= w_end)
    if (length(idx) < 2) next

    sub_gt <- gt_mat[idx, , drop = FALSE]

    pi_val <- tryCatch(calc_pi(sub_gt), error = function(e) NA)
    thetaW_val <- tryCatch(calc_thetaW(sub_gt), error = function(e) NA)
    tajD_val <- tryCatch(calc_tajimaD(sub_gt), error = function(e) NA)
    ZnS_val <- tryCatch(calc_ZnS(sub_gt), error = function(e) NA)
    Omega_val <- tryCatch(calc_omega(sub_gt), error = function(e) NA)

    df <- rbind(df, data.frame(
      window = paste0(w_start, "-", w_end),
      start = w_start,
      end = w_end,
      Pi = pi_val,
      ThetaW = thetaW_val,
      TajD = tajD_val,
      ZnS = ZnS_val,
      Omega = Omega_val
    ))
  }

  return(df)
}
