OdysseusCharacterizationModule — Eunomia Walkthrough

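This vignette walks through the main features of OdysseusCharacterizationModule using the Eunomia synthetic OMOP CDM database: base (domain) features, cohort features, patient-level output, SQL rendering, and error handling. Concept-set features (useConceptSetFeatures) are switched off in every example and are not covered here.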

Prerequisites

# Install each prerequisite only when its namespace cannot be loaded.
for (pkg in c("DatabaseConnector", "Eunomia")) {
  already_available <- requireNamespace(pkg, quietly = TRUE)
  if (already_available) {
    next
  }
  install.packages(pkg)
}

library(OdysseusCharacterizationModule)
library(DatabaseConnector)
library(Eunomia)

1. Connect to Eunomia and create cohorts

Eunomia::createCohorts() populates the cohort table with four built-in cohorts — Celecoxib (id = 1), Diclofenac (id = 2), GiBleed (id = 3), and NSAIDs (id = 4).


# Connection details for the Eunomia database.
connectionDetails <- Eunomia::getEunomiaConnectionDetails()

# Create the built-in cohorts before opening the connection used below.
Eunomia::createCohorts(connectionDetails)

# Connection reused by every query in this walkthrough.
connection <- DatabaseConnector::connect(connectionDetails)

Verify the cohort table:

# Number of rows per cohort definition in main.cohort.
cohort_count_sql <- "
  SELECT cohort_definition_id, COUNT(*) AS cnt
  FROM main.cohort
  GROUP BY cohort_definition_id
  ORDER BY cohort_definition_id
"
cohortCounts <- querySql(connection, cohort_count_sql)
cohortCounts

We will characterise the Celecoxib new-user cohort (id = 1) throughout this vignette.

# Target cohort characterised throughout the walkthrough.
COHORT_ID <- 1L

# Schema and cohort table names passed to every singleNodeSetting() call.
CDM_SCHEMA <- "main"
COHORT_TBL <- "cohort"

# SQLite temp-table emulation
TEMP_SCHEMA <- "main"

2. Define analysis windows

# Two windows, given as day offsets: -365..-1 and 1..365.
window_start_days <- c(-365, 1)
window_end_days <- c(-1, 365)

windows <- defineAnalysisWindows(
  startDays = window_start_days,
  endDays = window_end_days
)
windows

3. Base feature — Condition Occurrence (start type)

The simplest case: one domain, start-date logic, aggregated.

# Setting reused for every feature family that is switched off.
disabled <- list(include = FALSE)

# Only condition_occurrence is enabled (type = "start").
base_features_cond <- list(
  condition_occurrence = list(include = TRUE, type = "start"),
  condition_era = disabled,
  drug_exposure = disabled,
  drug_era = disabled,
  procedure_occurrence = disabled,
  observation = disabled,
  device_exposure = disabled,
  visit_occurrence = disabled,
  measurement = disabled
)

plan_cond <- planAnalysis(
  analysisWindows = windows,
  useBaseFeatures = base_features_cond,
  useCohortFeatures = disabled,
  useConceptSetFeatures = disabled
)

# The cohort table, CDM and vocabulary are all read from CDM_SCHEMA.
specs_cond <- singleNodeSetting(
  plan = plan_cond,
  cohortId = COHORT_ID,
  cohortDatabaseSchema = CDM_SCHEMA,
  cohortTable = COHORT_TBL,
  cdmDatabaseSchema = CDM_SCHEMA,
  vocabularyDatabaseSchema = CDM_SCHEMA,
  aggregated = TRUE
)

cat("Specs generated:", length(specs_cond), "\n")

results_cond <- executeSpecs(
  connection,
  specs_cond,
  tempEmulationSchema = TEMP_SCHEMA,
  cleanTempTables = TRUE
)

# Preview the first ten rows of analysis "1001".
head(results_cond[["1001"]], n = 10)

4. Base feature — Drug Exposure

# Setting reused for every feature family that is switched off.
disabled <- list(include = FALSE)

# Only drug_exposure is enabled; no type is given for it.
base_features_drug <- list(
  condition_occurrence = disabled,
  condition_era = disabled,
  drug_exposure = list(include = TRUE),
  drug_era = disabled,
  procedure_occurrence = disabled,
  observation = disabled,
  device_exposure = disabled,
  visit_occurrence = disabled,
  measurement = disabled
)

plan_drug <- planAnalysis(
  analysisWindows = windows,
  useBaseFeatures = base_features_drug,
  useCohortFeatures = disabled,
  useConceptSetFeatures = disabled
)

# The cohort table, CDM and vocabulary are all read from CDM_SCHEMA.
specs_drug <- singleNodeSetting(
  plan = plan_drug,
  cohortId = COHORT_ID,
  cohortDatabaseSchema = CDM_SCHEMA,
  cohortTable = COHORT_TBL,
  cdmDatabaseSchema = CDM_SCHEMA,
  vocabularyDatabaseSchema = CDM_SCHEMA,
  aggregated = TRUE
)

results_drug <- executeSpecs(
  connection,
  specs_drug,
  tempEmulationSchema = TEMP_SCHEMA,
  cleanTempTables = TRUE
)

# Preview the first ten rows of analysis "1001".
head(results_drug[["1001"]], n = 10)

5. Base feature — Condition Era (overlap type)

Overlap logic checks whether the era period overlaps the analysis window, rather than simply checking the start date.

# Setting reused for every feature family that is switched off.
disabled <- list(include = FALSE)

# Only condition_era is enabled (type = "overlap").
base_features_era <- list(
  condition_occurrence = disabled,
  condition_era = list(include = TRUE, type = "overlap"),
  drug_exposure = disabled,
  drug_era = disabled,
  procedure_occurrence = disabled,
  observation = disabled,
  device_exposure = disabled,
  visit_occurrence = disabled,
  measurement = disabled
)

plan_era <- planAnalysis(
  analysisWindows = windows,
  useBaseFeatures = base_features_era,
  useCohortFeatures = disabled,
  useConceptSetFeatures = disabled
)

# The cohort table, CDM and vocabulary are all read from CDM_SCHEMA.
specs_era <- singleNodeSetting(
  plan = plan_era,
  cohortId = COHORT_ID,
  cohortDatabaseSchema = CDM_SCHEMA,
  cohortTable = COHORT_TBL,
  cdmDatabaseSchema = CDM_SCHEMA,
  vocabularyDatabaseSchema = CDM_SCHEMA,
  aggregated = TRUE
)

results_era <- executeSpecs(
  connection,
  specs_era,
  tempEmulationSchema = TEMP_SCHEMA,
  cleanTempTables = TRUE
)

# Preview the first ten rows of analysis "1001".
head(results_era[["1001"]], n = 10)

6. Base feature — Drug Era (overlap type)

# Setting reused for every feature family that is switched off.
disabled <- list(include = FALSE)

# Only drug_era is enabled (type = "overlap").
base_features_dera <- list(
  condition_occurrence = disabled,
  condition_era = disabled,
  drug_exposure = disabled,
  drug_era = list(include = TRUE, type = "overlap"),
  procedure_occurrence = disabled,
  observation = disabled,
  device_exposure = disabled,
  visit_occurrence = disabled,
  measurement = disabled
)

plan_dera <- planAnalysis(
  analysisWindows = windows,
  useBaseFeatures = base_features_dera,
  useCohortFeatures = disabled,
  useConceptSetFeatures = disabled
)

# The cohort table, CDM and vocabulary are all read from CDM_SCHEMA.
specs_dera <- singleNodeSetting(
  plan = plan_dera,
  cohortId = COHORT_ID,
  cohortDatabaseSchema = CDM_SCHEMA,
  cohortTable = COHORT_TBL,
  cdmDatabaseSchema = CDM_SCHEMA,
  vocabularyDatabaseSchema = CDM_SCHEMA,
  aggregated = TRUE
)

results_dera <- executeSpecs(
  connection,
  specs_dera,
  tempEmulationSchema = TEMP_SCHEMA,
  cleanTempTables = TRUE
)

# Preview the first ten rows of analysis "1001".
head(results_dera[["1001"]], n = 10)

7. Base feature — Procedure Occurrence

# Setting reused for every feature family that is switched off.
disabled <- list(include = FALSE)

# Only procedure_occurrence is enabled (type = "start").
base_features_proc <- list(
  condition_occurrence = disabled,
  condition_era = disabled,
  drug_exposure = disabled,
  drug_era = disabled,
  procedure_occurrence = list(include = TRUE, type = "start"),
  observation = disabled,
  device_exposure = disabled,
  visit_occurrence = disabled,
  measurement = disabled
)

plan_proc <- planAnalysis(
  analysisWindows = windows,
  useBaseFeatures = base_features_proc,
  useCohortFeatures = disabled,
  useConceptSetFeatures = disabled
)

# The cohort table, CDM and vocabulary are all read from CDM_SCHEMA.
specs_proc <- singleNodeSetting(
  plan = plan_proc,
  cohortId = COHORT_ID,
  cohortDatabaseSchema = CDM_SCHEMA,
  cohortTable = COHORT_TBL,
  cdmDatabaseSchema = CDM_SCHEMA,
  vocabularyDatabaseSchema = CDM_SCHEMA,
  aggregated = TRUE
)

results_proc <- executeSpecs(
  connection,
  specs_proc,
  tempEmulationSchema = TEMP_SCHEMA,
  cleanTempTables = TRUE
)

# Preview the first ten rows of analysis "1001".
head(results_proc[["1001"]], n = 10)

8. Base feature — Measurement

# Setting reused for every feature family that is switched off.
disabled <- list(include = FALSE)

# Only measurement is enabled; no type is given for it.
base_features_meas <- list(
  condition_occurrence = disabled,
  condition_era = disabled,
  drug_exposure = disabled,
  drug_era = disabled,
  procedure_occurrence = disabled,
  observation = disabled,
  device_exposure = disabled,
  visit_occurrence = disabled,
  measurement = list(include = TRUE)
)

plan_meas <- planAnalysis(
  analysisWindows = windows,
  useBaseFeatures = base_features_meas,
  useCohortFeatures = disabled,
  useConceptSetFeatures = disabled
)

# The cohort table, CDM and vocabulary are all read from CDM_SCHEMA.
specs_meas <- singleNodeSetting(
  plan = plan_meas,
  cohortId = COHORT_ID,
  cohortDatabaseSchema = CDM_SCHEMA,
  cohortTable = COHORT_TBL,
  cdmDatabaseSchema = CDM_SCHEMA,
  vocabularyDatabaseSchema = CDM_SCHEMA,
  aggregated = TRUE
)

results_meas <- executeSpecs(
  connection,
  specs_meas,
  tempEmulationSchema = TEMP_SCHEMA,
  cleanTempTables = TRUE
)

# Preview the first ten rows of analysis "1001".
head(results_meas[["1001"]], n = 10)

9. Base feature — Observation

# Setting reused for every feature family that is switched off.
disabled <- list(include = FALSE)

# Only observation is enabled; no type is given for it.
base_features_obs <- list(
  condition_occurrence = disabled,
  condition_era = disabled,
  drug_exposure = disabled,
  drug_era = disabled,
  procedure_occurrence = disabled,
  observation = list(include = TRUE),
  device_exposure = disabled,
  visit_occurrence = disabled,
  measurement = disabled
)

plan_obs <- planAnalysis(
  analysisWindows = windows,
  useBaseFeatures = base_features_obs,
  useCohortFeatures = disabled,
  useConceptSetFeatures = disabled
)

# The cohort table, CDM and vocabulary are all read from CDM_SCHEMA.
specs_obs <- singleNodeSetting(
  plan = plan_obs,
  cohortId = COHORT_ID,
  cohortDatabaseSchema = CDM_SCHEMA,
  cohortTable = COHORT_TBL,
  cdmDatabaseSchema = CDM_SCHEMA,
  vocabularyDatabaseSchema = CDM_SCHEMA,
  aggregated = TRUE
)

results_obs <- executeSpecs(
  connection,
  specs_obs,
  tempEmulationSchema = TEMP_SCHEMA,
  cleanTempTables = TRUE
)

# Preview ten rows of analysis "1001", matching the other single-domain
# sections; without an explicit n, head() would show only six rows.
head(results_obs[["1001"]], n = 10)

10. Base feature — Visit Occurrence (overlap type)

# Setting reused for every feature family that is switched off.
disabled <- list(include = FALSE)

# Only visit_occurrence is enabled (type = "overlap").
base_features_visit <- list(
  condition_occurrence = disabled,
  condition_era = disabled,
  drug_exposure = disabled,
  drug_era = disabled,
  procedure_occurrence = disabled,
  observation = disabled,
  device_exposure = disabled,
  visit_occurrence = list(include = TRUE, type = "overlap"),
  measurement = disabled
)

plan_visit <- planAnalysis(
  analysisWindows = windows,
  useBaseFeatures = base_features_visit,
  useCohortFeatures = disabled,
  useConceptSetFeatures = disabled
)

# The cohort table, CDM and vocabulary are all read from CDM_SCHEMA.
specs_visit <- singleNodeSetting(
  plan = plan_visit,
  cohortId = COHORT_ID,
  cohortDatabaseSchema = CDM_SCHEMA,
  cohortTable = COHORT_TBL,
  cdmDatabaseSchema = CDM_SCHEMA,
  vocabularyDatabaseSchema = CDM_SCHEMA,
  aggregated = TRUE
)

results_visit <- executeSpecs(
  connection,
  specs_visit,
  tempEmulationSchema = TEMP_SCHEMA,
  cleanTempTables = TRUE
)

# Preview ten rows of analysis "1001", matching the other single-domain
# sections; without an explicit n, head() would show only six rows.
head(results_visit[["1001"]], n = 10)

11. Non-aggregated (patient-level) output

Setting aggregated = FALSE returns one row per patient-concept pair instead of aggregating the results across patients.

# Reuse the condition-occurrence plan, this time with aggregated = FALSE.
specs_raw <- singleNodeSetting(
  plan = plan_cond,
  cohortId = COHORT_ID,
  cohortDatabaseSchema = CDM_SCHEMA,
  cohortTable = COHORT_TBL,
  cdmDatabaseSchema = CDM_SCHEMA,
  vocabularyDatabaseSchema = CDM_SCHEMA,
  aggregated = FALSE
)

results_raw <- executeSpecs(
  connection,
  specs_raw,
  tempEmulationSchema = TEMP_SCHEMA,
  cleanTempTables = TRUE
)

# Row count and a ten-row preview of analysis "1001".
patient_level_rows <- results_raw[["1001"]]
cat("Patient-level rows (window 1):", nrow(patient_level_rows), "\n")
head(patient_level_rows, n = 10)

12. Multiple domains at once

Enable several domains in a single plan for an integrated analysis.

# Setting reused for every feature family that is switched off.
disabled <- list(include = FALSE)

# Seven domains are enabled; observation and device_exposure stay off.
base_features_multi <- list(
  condition_occurrence = list(include = TRUE, type = "start"),
  condition_era = list(include = TRUE, type = "overlap"),
  drug_exposure = list(include = TRUE),
  drug_era = list(include = TRUE, type = "overlap"),
  procedure_occurrence = list(include = TRUE),
  observation = disabled,
  device_exposure = disabled,
  visit_occurrence = list(include = TRUE, type = "overlap"),
  measurement = list(include = TRUE)
)

plan_multi <- planAnalysis(
  analysisWindows = windows,
  useBaseFeatures = base_features_multi,
  useCohortFeatures = disabled,
  useConceptSetFeatures = disabled
)

# The cohort table, CDM and vocabulary are all read from CDM_SCHEMA.
specs_multi <- singleNodeSetting(
  plan = plan_multi,
  cohortId = COHORT_ID,
  cohortDatabaseSchema = CDM_SCHEMA,
  cohortTable = COHORT_TBL,
  cdmDatabaseSchema = CDM_SCHEMA,
  vocabularyDatabaseSchema = CDM_SCHEMA,
  aggregated = TRUE
)

cat("Total specs:", length(specs_multi), "\n")

results_multi <- executeSpecs(
  connection,
  specs_multi,
  tempEmulationSchema = TEMP_SCHEMA,
  cleanTempTables = TRUE
)

# One summary row (analysis id and row count) per result, stacked into a table.
multi_summary_rows <- lapply(names(results_multi), function(analysis_id) {
  data.frame(
    analysis_id = analysis_id,
    rows = nrow(results_multi[[analysis_id]]),
    stringsAsFactors = FALSE
  )
})
summary_df <- do.call(rbind, multi_summary_rows)
summary_df

13. Cohort features — Using the GiBleed and NSAIDs cohorts as covariates

Use pre-defined cohorts as binary covariates. Here we test whether Celecoxib patients also appear in the GiBleed (id = 3) and NSAIDs (id = 4) cohorts.

# Setting reused for every feature family that is switched off.
disabled <- list(include = FALSE)

# No base-domain features in this run.
base_features_none <- list(
  condition_occurrence = disabled,
  condition_era = disabled,
  drug_exposure = disabled,
  drug_era = disabled,
  procedure_occurrence = disabled,
  observation = disabled,
  device_exposure = disabled,
  visit_occurrence = disabled,
  measurement = disabled
)

# Cohorts 3 and 4 from main.cohort are used as covariates (type = "start").
cohort_features <- list(
  include = TRUE,
  type = "start",
  cohortIds = c(3L, 4L),
  cohortNames = c("GiBleed", "NSAIDs"),
  cohortTable = "cohort",
  covariateSchema = "main"
)

plan_cohort <- planAnalysis(
  analysisWindows = windows,
  useBaseFeatures = base_features_none,
  useCohortFeatures = cohort_features,
  useConceptSetFeatures = disabled
)

# The cohort table, CDM and vocabulary are all read from CDM_SCHEMA.
specs_cohort <- singleNodeSetting(
  plan = plan_cohort,
  cohortId = COHORT_ID,
  cohortDatabaseSchema = CDM_SCHEMA,
  cohortTable = COHORT_TBL,
  cdmDatabaseSchema = CDM_SCHEMA,
  vocabularyDatabaseSchema = CDM_SCHEMA,
  aggregated = TRUE
)

cat("Cohort feature specs:", length(specs_cohort), "\n")

results_cohort <- executeSpecs(
  connection,
  specs_cohort,
  tempEmulationSchema = TEMP_SCHEMA,
  cleanTempTables = TRUE
)

# Print each result under a header that names its analysis id.
for (analysis_id in names(results_cohort)) {
  cat("\n--- Analysis", analysis_id, "---\n")
  print(results_cohort[[analysis_id]])
}

14. Cohort features — Overlap type

# Setting reused for every feature family that is switched off.
disabled <- list(include = FALSE)

# No base-domain features in this run.
base_features_none <- list(
  condition_occurrence = disabled,
  condition_era = disabled,
  drug_exposure = disabled,
  drug_era = disabled,
  procedure_occurrence = disabled,
  observation = disabled,
  device_exposure = disabled,
  visit_occurrence = disabled,
  measurement = disabled
)

# Cohort 3 from main.cohort is used as a covariate (type = "overlap").
cohort_features_overlap <- list(
  include = TRUE,
  type = "overlap",
  cohortIds = 3L,
  cohortNames = "GiBleed",
  cohortTable = "cohort",
  covariateSchema = "main"
)

plan_coh_ov <- planAnalysis(
  analysisWindows = windows,
  useBaseFeatures = base_features_none,
  useCohortFeatures = cohort_features_overlap,
  useConceptSetFeatures = disabled
)

# The cohort table, CDM and vocabulary are all read from CDM_SCHEMA.
specs_coh_ov <- singleNodeSetting(
  plan = plan_coh_ov,
  cohortId = COHORT_ID,
  cohortDatabaseSchema = CDM_SCHEMA,
  cohortTable = COHORT_TBL,
  cdmDatabaseSchema = CDM_SCHEMA,
  vocabularyDatabaseSchema = CDM_SCHEMA,
  aggregated = TRUE
)

# Show the overlap field of the first generated spec.
first_overlap_spec <- specs_coh_ov[[1]]
cat("Overlap flag:", first_overlap_spec$overlap, "\n")

results_coh_ov <- executeSpecs(
  connection,
  specs_coh_ov,
  tempEmulationSchema = TEMP_SCHEMA,
  cleanTempTables = TRUE
)

# Print each result under a header that names its analysis id.
for (analysis_id in names(results_coh_ov)) {
  cat("\n--- Analysis", analysis_id, "---\n")
  print(results_coh_ov[[analysis_id]])
}

15. SQL rendering without execution

You can inspect the generated SQL without a database connection using renderSpecSql() and renderAllSpecSql().

# Render the first condition spec and print the first 500 characters.
first_cond_spec <- specs_cond[[1]]
sql_default <- renderSpecSql(first_cond_spec)

cat("--- SQL Server (default) ---\n")
sql_default_preview <- substr(sql_default, start = 1, stop = 500)
cat(sql_default_preview, "\n...\n")

Translate to other dialects:

# Render the same spec once per target dialect and print a 400-character preview.
target_dialects <- c("postgresql", "redshift", "oracle", "spark")

for (dialect in target_dialects) {
  cat("\n--- Dialect:", dialect, "---\n")
  sql_translated <- renderSpecSql(specs_cond[[1]], targetDialect = dialect)
  translated_preview <- substr(sql_translated, start = 1, stop = 400)
  cat(translated_preview, "\n...\n")
}

Batch rendering:

# Render every condition spec at once; report how many came back and their names.
all_sql <- renderAllSpecSql(specs_cond)

rendered_count <- length(all_sql)
rendered_ids <- toString(names(all_sql))

cat("Number of rendered SQL statements:", rendered_count, "\n")
cat("Analysis IDs:", rendered_ids, "\n")

16. Multiple time windows

The number of specs scales linearly with the number of windows.

# Eight windows as day offsets: four ending at day -1, four after day 0.
windows_8 <- defineAnalysisWindows(
  startDays = c(-365, -180, -90, -30, 1, 31, 91, 181),
  endDays = c(-1, -1, -1, -1, 30, 90, 180, 365)
)

# Setting reused for every feature family that is switched off.
disabled <- list(include = FALSE)

# Only condition_occurrence is enabled (type = "start").
base_features_8w <- list(
  condition_occurrence = list(include = TRUE, type = "start"),
  condition_era = disabled,
  drug_exposure = disabled,
  drug_era = disabled,
  procedure_occurrence = disabled,
  observation = disabled,
  device_exposure = disabled,
  visit_occurrence = disabled,
  measurement = disabled
)

plan_8w <- planAnalysis(
  analysisWindows = windows_8,
  useBaseFeatures = base_features_8w,
  useCohortFeatures = disabled,
  useConceptSetFeatures = disabled
)

# The cohort table, CDM and vocabulary are all read from CDM_SCHEMA.
specs_8w <- singleNodeSetting(
  plan = plan_8w,
  cohortId = COHORT_ID,
  cohortDatabaseSchema = CDM_SCHEMA,
  cohortTable = COHORT_TBL,
  cdmDatabaseSchema = CDM_SCHEMA,
  vocabularyDatabaseSchema = CDM_SCHEMA,
  aggregated = TRUE
)

cat("Specs with 8 windows:", length(specs_8w), "\n")

results_8w <- executeSpecs(
  connection,
  specs_8w,
  tempEmulationSchema = TEMP_SCHEMA,
  cleanTempTables = TRUE
)

# Row count per analysis. row.names = NULL stops data.frame() from copying the
# names of the vapply() result into the row names, which would print every
# analysis id twice; stringsAsFactors = FALSE keeps analysis_id as character
# on R < 4.0, matching the summary tables in sections 12 and 17.
data.frame(
  analysis_id = names(results_8w),
  rows = vapply(results_8w, nrow, integer(1)),
  row.names = NULL,
  stringsAsFactors = FALSE
)

17. Combined — Base + Cohort features in one run

# Setting reused for every feature family that is switched off.
disabled <- list(include = FALSE)

# Four base domains are enabled: condition_occurrence, drug_exposure,
# procedure_occurrence and measurement.
base_features_combined <- list(
  condition_occurrence = list(include = TRUE, type = "start"),
  condition_era = disabled,
  drug_exposure = list(include = TRUE),
  drug_era = disabled,
  procedure_occurrence = list(include = TRUE),
  observation = disabled,
  device_exposure = disabled,
  visit_occurrence = disabled,
  measurement = list(include = TRUE)
)

# Cohort 3 from main.cohort is added as a covariate (type = "start").
cohort_features_combined <- list(
  include = TRUE,
  type = "start",
  cohortIds = 3L,
  cohortNames = "GiBleed",
  cohortTable = "cohort",
  covariateSchema = "main"
)

plan_combined <- planAnalysis(
  analysisWindows = windows,
  useBaseFeatures = base_features_combined,
  useCohortFeatures = cohort_features_combined,
  useConceptSetFeatures = disabled
)

# The cohort table, CDM and vocabulary are all read from CDM_SCHEMA.
specs_combined <- singleNodeSetting(
  plan = plan_combined,
  cohortId = COHORT_ID,
  cohortDatabaseSchema = CDM_SCHEMA,
  cohortTable = COHORT_TBL,
  cdmDatabaseSchema = CDM_SCHEMA,
  vocabularyDatabaseSchema = CDM_SCHEMA,
  aggregated = TRUE
)

combined_spec_count <- length(specs_combined)
cat(
  "Total specs (4 base domains x 2 windows + 1 cohort x 2 windows):",
  combined_spec_count,
  "\n"
)

results_combined <- executeSpecs(
  connection,
  specs_combined,
  tempEmulationSchema = TEMP_SCHEMA,
  cleanTempTables = TRUE
)

# One summary row per result: analysis id, "data"/"empty" marker, row count.
combined_summary_rows <- lapply(names(results_combined), function(analysis_id) {
  n_rows <- nrow(results_combined[[analysis_id]])
  status <- if (n_rows > 0) "data" else "empty"
  data.frame(
    analysis_id = analysis_id,
    source = status,
    rows = n_rows,
    stringsAsFactors = FALSE
  )
})
summary_combined <- do.call(rbind, combined_summary_rows)
summary_combined

18. Characterising a different cohort — Diclofenac

All examples so far used Celecoxib (id = 1). Switching to a different cohort is as simple as changing cohortId.

# Same condition-occurrence plan as before; only the target cohort id changes.
diclofenac_cohort_id <- 2L

specs_diclo <- singleNodeSetting(
  plan = plan_cond,
  cohortId = diclofenac_cohort_id,
  cohortDatabaseSchema = CDM_SCHEMA,
  cohortTable = COHORT_TBL,
  cdmDatabaseSchema = CDM_SCHEMA,
  vocabularyDatabaseSchema = CDM_SCHEMA,
  aggregated = TRUE
)

results_diclo <- executeSpecs(
  connection,
  specs_diclo,
  tempEmulationSchema = TEMP_SCHEMA,
  cleanTempTables = TRUE
)

# Label, then a ten-row preview of analysis "1001".
cat("Diclofenac condition covariates (pre-index):\n")
head(results_diclo[["1001"]], n = 10)

19. Error handling with stopOnError = FALSE

When executing many specs, you can continue past failures.

# Re-run the multi-domain specs with stopOnError = FALSE.
results_safe <- executeSpecs(
  connection,
  specs_multi,
  tempEmulationSchema = TEMP_SCHEMA,
  cleanTempTables = TRUE,
  stopOnError = FALSE
)

# A result counts as failed when it carries an "error" attribute.
error_free <- vapply(
  results_safe,
  function(result) is.null(attr(result, "error")),
  logical(1)
)
failed <- !error_free
cat("Failed specs:", sum(failed), "/", length(results_safe), "\n")

20. Cleanup

# Close the connection opened at the start of the walkthrough.
DatabaseConnector::disconnect(connection)

Session info

# Record the R version and package versions used for this run.
utils::sessionInfo()