diff --git a/Apptainer.def b/Apptainer.def
index 44a6d06..e36e100 100644
--- a/Apptainer.def
+++ b/Apptainer.def
@@ -18,6 +18,9 @@ From: rocker/r2u:jammy
    set -e
    apt-get update \
        && apt-get install -y --no-install-recommends \
+        less \
+        python3 \
+        python3-pip \
        r-bioc-delayedarray \
        r-bioc-hdf5array \
        r-cran-broom \
@@ -37,9 +40,18 @@ From: rocker/r2u:jammy
        r-cran-tibble \
        r-cran-tidyr \
        r-cran-tidyverse \
+        r-cran-tiledb \
        && apt-get clean \
        && echo 'options(bspm.sudo = TRUE)' >> /etc/R/Rprofile.site \
-        && rm -rf /var/lib/apt/lists/*
+        && rm -rf /var/lib/apt/lists/* \
+        && pip3 install radian
+
+    # Create .Rprofile in root home to silence bspm warning
+    mkdir -p /root
+    echo 'options(bspm.sudo = TRUE)' >> /root/.Rprofile
+
+    # Install TileDBArray from Bioconductor (not available as r-bioc- package)
+    R -e 'if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager"); BiocManager::install("TileDBArray", update = FALSE, ask = FALSE)'

    cd /ModelArray
    R -e 'devtools::install()'
@@ -53,7 +65,7 @@ From: rocker/r2u:jammy
    if [ $# -gt 0 ]; then
        exec "$@"
    else
-        exec R
+        exec radian
    fi

diff --git a/DESCRIPTION b/DESCRIPTION
index e4cc7cb..04a202a 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -23,25 +23,25 @@ Depends: R (>= 4.1.2)
biocViews:
Imports:
    DelayedArray,
-    HDF5Array,
    broom,
    crayon,
    doParallel,
    dplyr,
    glue,
-    hdf5r,
    magrittr,
    methods,
    mgcv,
    parallel,
    pbapply,
    pbmcapply,
-    rhdf5,
    rlang,
    tibble,
    tidyr
RoxygenNote: 7.3.1
Suggests:
+    HDF5Array,
+    rhdf5,
+    TileDBArray,
    rmarkdown,
    knitr,
    testthat (>= 3.0.0),
diff --git a/NAMESPACE b/NAMESPACE
index c88d5ea..2ce30f6 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -27,7 +27,6 @@ import(tibble)
importClassesFrom(DelayedArray,DelayedArray)
importFrom(DelayedArray,DelayedArray)
importFrom(DelayedArray,realize)
-importFrom(HDF5Array,HDF5ArraySeed)
importFrom(crayon,black)
importFrom(dplyr,"%>%")
importFrom(dplyr,bind_cols)
diff --git a/R/ModelArray_Constructor.R b/R/ModelArray_Constructor.R
index 774e47c..8ad10e8 100644
--- a/R/ModelArray_Constructor.R
+++ b/R/ModelArray_Constructor.R
@@ -154,10 +154,16 @@ ModelArraySeed <- function(filepath, name, type = NA) {
#' Then please check if you give correct "scalar_types" - check via
#' rhdf5::h5ls(filename_for_h5)
#'
-#' @param filepath file
+#' @param filepath file path or backend root URI
#' @param scalar_types expected scalars
#' @param analysis_names the subfolder names for results in .h5 file. If empty
#'   (default), results are not read.
+#' @param backend character, storage backend to use. One of "hdf5" (default) or
+#'   "tiledb". When "tiledb", `filepath` should point to a TileDB group root
+#'   containing arrays at `scalars/<scalar_type>/values` (and optionally
+#'   `scalars/<scalar_type>/column_names`). Results loading currently requires
+#'   matching TileDB arrays at `results/<analysis_name>/results_matrix` and
+#'   `results/<analysis_name>/column_names`.
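+#' @examples
+#' \dontrun{
+#' # Default HDF5 backend, using the example file shipped with the package:
+#' h5_path <- system.file("extdata", "n50_fixels.h5", package = "ModelArray")
+#' modelarray <- ModelArray(h5_path, scalar_types = c("FD"))
+#'
+#' # TileDB backend; a minimal sketch with a hypothetical group root laid out
+#' # as described for `backend` above (not a file shipped with the package):
+#' modelarray_tdb <- ModelArray("/path/to/tiledb_root",
+#'   scalar_types = c("FD"),
+#'   backend = "tiledb"
+#' )
+#' }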
#' @return ModelArray object
#' @export
#' @import methods
@@ -166,7 +172,9 @@ ModelArraySeed <- function(filepath, name, type = NA) {
#' @importFrom rhdf5 h5readAttributes
ModelArray <- function(filepath,
                       scalar_types = c("FD"),
-                       analysis_names = character(0)) {
+                       analysis_names = character(0),
+                       backend = c("hdf5", "tiledb")) {
+  backend <- match.arg(backend)
  # TODO: try and use hdf5r instead of rhdf5 and delayedarray here
  # fn.h5 <- H5File$new(filepath, mode="a") # open; "a": creates a new file or opens an existing one for read/write
@@ -185,63 +193,115 @@ ModelArray <- function(filepath,
  for (x in seq_along(scalar_types)) {
    # TODO: IT'S BETTER TO CHECK IF THIS SCALAR_TYPE EXISTS OR NOT..... - Chenying
-    # /scalars/<scalar_type>/values:
-    scalar_data[[x]] <- ModelArraySeed(
-      filepath,
-      name = sprintf("scalars/%s/values", scalar_types[x]),
-      type = NA
-    ) %>% DelayedArray::DelayedArray()
-
-    # load source filenames (column_names): prefer attribute; fallback to dataset
-    attrs <- rhdf5::h5readAttributes(filepath, name = sprintf("scalars/%s/values", scalar_types[x]))
-    colnames_attr <- attrs$column_names
-    if (is.null(colnames_attr)) {
-      # Fallback: attempt to read from dataset-based column names
-      # Try multiple plausible locations for compatibility across writers
-      paths_to_try <- c(
-        sprintf("scalars/%s/column_names", scalar_types[x]),
-        sprintf("scalars/%s/values/column_names", scalar_types[x]),
-        sprintf("scalars/scalars/%s/values/column_names", scalar_types[x]),
-        sprintf("scalars/scalars/%s/column_names", scalar_types[x])
-      )
+    if (backend == "hdf5") {
+      # HDF5 dependencies must be available only when using the HDF5 backend
+      if (!requireNamespace("HDF5Array", quietly = TRUE)) {
+        stop("backend='hdf5' requires the HDF5Array package. Please install it.")
+      }
+      if (!requireNamespace("rhdf5", quietly = TRUE)) {
+        stop("backend='hdf5' requires the rhdf5 package. Please install it.")
+      }
+      # /scalars/<scalar_type>/values in HDF5:
+      scalar_data[[x]] <- ModelArraySeed(
+        filepath,
+        name = sprintf("scalars/%s/values", scalar_types[x]),
+        type = NA
+      ) %>% DelayedArray::DelayedArray()
+
+      # load source filenames (column_names): prefer attribute; fallback to dataset
+      attrs <- rhdf5::h5readAttributes(filepath, name = sprintf("scalars/%s/values", scalar_types[x]))
+      colnames_attr <- attrs$column_names
+      if (is.null(colnames_attr)) {
+        # Fallback: attempt to read from dataset-based column names
+        # Try multiple plausible locations for compatibility across writers
+        paths_to_try <- c(
+          sprintf("scalars/%s/column_names", scalar_types[x]),
+          sprintf("scalars/%s/values/column_names", scalar_types[x]),
+          sprintf("scalars/scalars/%s/values/column_names", scalar_types[x]),
+          sprintf("scalars/scalars/%s/column_names", scalar_types[x])
+        )
-    colnames_ds <- NULL
-    last_error <- NULL
-    for (p in paths_to_try) {
-      tmp <- tryCatch(
-        {
-          rhdf5::h5read(filepath, p)
-        },
-        error = function(e) {
-          last_error <<- e
-          NULL
+        colnames_ds <- NULL
+        last_error <- NULL
+        for (p in paths_to_try) {
+          tmp <- tryCatch(
+            {
+              rhdf5::h5read(filepath, p)
+            },
+            error = function(e) {
+              last_error <<- e
+              NULL
+            }
+          )
+          if (!is.null(tmp)) {
+            colnames_ds <- tmp
+            break
          }
-        )
-        if (!is.null(tmp)) {
-          colnames_ds <- tmp
-          break
        }
+        if (is.null(colnames_ds)) {
+          stop(paste0(
+            "Neither attribute 'column_names' nor a dataset with column names found. Tried: ",
+            paste(paths_to_try, collapse = ", "),
+            if (!is.null(last_error)) paste0(". Last error: ", conditionMessage(last_error)) else ""
+          ))
+        }
+        # Ensure character vector, not list/matrix; trim potential null terminators and whitespace
+        if (is.list(colnames_ds)) {
+          colnames_ds <- unlist(colnames_ds, use.names = FALSE)
+        }
+        colnames_ds <- as.vector(colnames_ds)
+        colnames_ds <- as.character(colnames_ds)
+        # Trim any trailing NULs (hex 00) and surrounding whitespace for cross-language string compatibility
+        # Use escaped hex in pattern to avoid embedding a NUL in the source code
+        colnames_ds <- gsub("[\\x00]+$", "", colnames_ds, perl = TRUE, useBytes = TRUE)
+        colnames_ds <- trimws(colnames_ds)
+        sources[[x]] <- colnames_ds
+      } else {
+        sources[[x]] <- as.character(colnames_attr)
      }
-    }
-    if (is.null(colnames_ds)) {
+    } else if (backend == "tiledb") {
+      # TileDB: open dense array at scalars/<scalar_type>/values
+      if (!requireNamespace("TileDBArray", quietly = TRUE)) {
+        stop("backend='tiledb' requires the TileDBArray package. Please install it.")
+      }
+      values_uri <- file.path(filepath, sprintf("scalars/%s/values", scalar_types[x]))
+      # TileDBArray::TileDBArray() returns a DelayedArray directly
+      scalar_data[[x]] <- tryCatch(
+        {
+          TileDBArray::TileDBArray(values_uri)
+        },
+        error = function(e) {
+          stop(paste0(
+            "Error loading TileDB array at '", values_uri, "': ",
+            conditionMessage(e),
+            "\nThis may indicate a type mismatch in the TileDB array schema. ",
+            "The array should have float32 or float64 attribute values with int64 dimensions. ",
+            "Check the array schema using: tiledb::tiledb_array_schema_load(values_uri)"
+          ))
+        }
+      )
+
+      # Try to read sources/column names from a sidecar TileDB array if present
+      # at scalars/<scalar_type>/column_names (1D)
+      cn_uri <- file.path(filepath, sprintf("scalars/%s/column_names", scalar_types[x]))
+      colnames_ds <- NULL
+      suppressWarnings({
+        try({
+          cn_arr <- TileDBArray::TileDBArray(cn_uri)
+          # realize into R vector
+          colnames_ds <- as.vector(DelayedArray::realize(cn_arr))
+        }, silent = TRUE)
+      })
+      if (!is.null(colnames_ds)) {
+        colnames_ds <- as.character(colnames_ds)
+        colnames_ds <- trimws(colnames_ds)
+        sources[[x]] <- colnames_ds
+      } else {
        stop(paste0(
-          "Neither attribute 'column_names' nor a dataset with column names found. Tried: ",
-          paste(paths_to_try, collapse = ", "),
-          if (!is.null(last_error)) paste0(". Last error: ", conditionMessage(last_error)) else ""
+          "TileDB backend requires column names at '", cn_uri, "'. ",
+          "Please write a 1D TileDB array of subject/source names or supply an HDF5 file."
+        ))
+      }
    }
-      # Ensure character vector, not list/matrix; trim potential null terminators and whitespace
-      if (is.list(colnames_ds)) {
-        colnames_ds <- unlist(colnames_ds, use.names = FALSE)
-      }
-      colnames_ds <- as.vector(colnames_ds)
-      colnames_ds <- as.character(colnames_ds)
-      # Trim any trailing NULs (hex 00) and surrounding whitespace for cross-language string compatibility
-      # Use escaped hex in pattern to avoid embedding a NUL in the source code
-      colnames_ds <- gsub("[\\x00]+$", "", colnames_ds, perl = TRUE, useBytes = TRUE)
-      colnames_ds <- trimws(colnames_ds)
-      sources[[x]] <- colnames_ds
-    } else {
-      sources[[x]] <- as.character(colnames_attr)
    }

    # transpose scalar_data[[x]] if needed:
@@ -274,9 +334,9 @@ ModelArray <- function(filepath,
    # user did not request any analyses; do not touch /results
    results_data <- list()
  } else {
+    if (backend == "hdf5") {
      # user requested analyses; check if results group exists in this .h5 file
      flag_results_exist <- flagResultsGroupExistInh5(filepath)
-      # message(flag_results_exist)
      if (flag_results_exist == FALSE) {
        results_data <- list()
      } else {
@@ -291,20 +351,10 @@ ModelArray <- function(filepath,
        if (flag_analysis_exist == FALSE) {
          stop(paste0("This analysis: ", analysis_name, " does not exist..."))
        } else {
-          # exists
-          # /results/<analysis_name>/has_names:
          names_results_matrix <- rhdf5::h5readAttributes(filepath,
            name = sprintf("results/%s/results_matrix", analysis_name)
          )$colnames # after updating writeResults()
-          # names_results_matrix <- ModelArraySeed(filepath, name = sprintf(
-          #   "results/%s/has_names", analysis_name), type = NA) %>%
-          #   DelayedArray::DelayedArray()
-          # if (dim(names_results_matrix)[1]
          # /results/<analysis_name>/results_matrix:
          results_data[[x]]$results_matrix <- ModelArraySeed(
            filepath,
            name = sprintf("results/%s/results_matrix", analysis_name),
@@ -318,8 +368,6 @@
          colnames(results_data[[x]]$results_matrix) <-
            as.character(DelayedArray::realize(names_results_matrix)) # designate the column names
-
-          # /results/<analysis_name>/lut_col?:
          # LOOP OVER # OF COL OF $RESULTS_MATRIX, AND SEE IF THERE IS LUT_COL
          for (i_col in seq_along(names_results_matrix)) {
            object_name <- paste0("lut_forcol", as.character(i_col))
            flag_lut_exist <- flagObjectExistInh5(
@@ -334,29 +382,78 @@
                type = NA
              ) %>% DelayedArray::DelayedArray()
-              # results_data[[x]]$lut[[i_col]] <- lut
-
-              # turn values in results_matrix into factors |
-              # HOWEVER, this also makes the entire $results_matrix into type "character"....
              lut <- lut %>% as.character()
              for (j_lut in seq_along(lut)) {
                str_lut <- lut[j_lut]
                idx_list <- results_data[[x]]$results_matrix[, i_col] %in% c(j_lut)
                results_data[[x]]$results_matrix[idx_list, i_col] <- lut[j_lut]
              }
-
-              # } else { # the lut for this column does not exist
-              #   results_data[[x]]$lut[[i_col]] <- NULL
            }
          }

          # name the analysis:
          names(results_data)[[x]] <- analysis_name
+        }
+      }
+    }
+  } else if (backend == "tiledb") {
+    if (!requireNamespace("TileDBArray", quietly = TRUE)) {
+      stop("backend='tiledb' requires the TileDBArray package. Please install it.")
+    }
+    if (!requireNamespace("tiledb", quietly = TRUE)) {
+      stop("backend='tiledb' requires the tiledb package. Please install it.")
+    }
+    flag_results_exist <- flagResultsGroupExistInTileDB(filepath)
+    if (flag_results_exist == FALSE) {
+      results_data <- list()
+    } else {
+      results_data <- vector("list", length(analysis_names))
+
+      for (x in seq_along(analysis_names)) {
+        analysis_name <- analysis_names[x]
+        flag_analysis_exist <- flagAnalysisExistInTileDB(filepath, analysis_name = analysis_name)
+        if (flag_analysis_exist == FALSE) {
+          stop(paste0("This analysis: ", analysis_name, " does not exist..."))
+        }
+
+        names_results_matrix <- TileDBArray::TileDBArray(
+          file.path(filepath, sprintf("results/%s/column_names", analysis_name))
+        ) %>%
+          DelayedArray::realize() %>%
+          as.character()
+
+        results_matrix_uri <- file.path(filepath, sprintf("results/%s/results_matrix", analysis_name))
+        results_data[[x]]$results_matrix <- TileDBArray::TileDBArray(results_matrix_uri)
+
+        if (dim(results_data[[x]]$results_matrix)[2] != length(names_results_matrix)) {
+          results_data[[x]]$results_matrix <- t(results_data[[x]]$results_matrix)
+        }
-          # NOTES:
-          # if there is no "$lut", we can remove "$results_matrix", so that results(ModelArray)
-          # would look like: $<analysis_name>, instead of $<analysis_name>$results_matrix
+        colnames(results_data[[x]]$results_matrix) <- names_results_matrix
+
+        for (i_col in seq_along(names_results_matrix)) {
+          object_name <- paste0("lut_forcol", as.character(i_col))
+          flag_lut_exist <- flagObjectExistInTileDB(
+            filepath,
+            group_path = file.path("results", analysis_name),
+            object_name = object_name
+          )
+          if (flag_lut_exist == TRUE) {
+            lut_uri <- file.path(filepath, "results", analysis_name, object_name)
+            lut <- TileDBArray::TileDBArray(lut_uri) %>%
+              DelayedArray::realize() %>%
+              as.character()
+
+            for (j_lut in seq_along(lut)) {
+              str_lut <- lut[j_lut]
+              idx_list <- results_data[[x]]$results_matrix[, i_col] %in% c(j_lut)
+              results_data[[x]]$results_matrix[idx_list, i_col] <- lut[j_lut]
+            }
+          }
+        }
+
+        names(results_data)[[x]] <- analysis_name
      }
    }
  }
@@ -1180,12 +1277,15 @@ analyseOneElement.wrap <- function(i_element,
#' @param analysis_name A character, the name of the results
#' @param overwrite If a group with the same analysis_name exists in HDF5 file,
#'   whether overwrite it (TRUE) or not (FALSE)
+#' @param backend Storage backend to use ("hdf5" or "tiledb"). Default: "hdf5".
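+#' @examples
+#' \dontrun{
+#' # A minimal sketch; `df.out` stands in for a data.frame of model results
+#' # and both output paths are hypothetical:
+#' writeResults("myresults.h5", df.output = df.out, analysis_name = "lm_FD")
+#' # With the TileDB backend, fn.output is a TileDB group root instead:
+#' writeResults("/path/to/tiledb_root", df.output = df.out,
+#'   analysis_name = "lm_FD", backend = "tiledb"
+#' )
+#' }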
#' @import hdf5r
#' @export
writeResults <- function(fn.output,
                         df.output,
                         analysis_name = "myAnalysis",
-                         overwrite = TRUE) {
+                         overwrite = TRUE,
+                         backend = c("hdf5", "tiledb")) {
+  backend <- match.arg(backend)
  # This is enhanced version with: 1) change to hdf5r; 2) write results with only one row for one element

  # check "df.output"
@@ -1193,7 +1293,35 @@
    stop("Results dataset is not correct; must be data of type `data.frame`")
  }

+  # normalize column types: cast non-numeric columns to numeric and capture LUTs
+  lut_values <- vector("list", ncol(df.output))
+  for (i_col in seq_len(ncol(df.output))) {
+    col_class <- as.character(sapply(df.output, class)[i_col])
+    if (!col_class %in% c("numeric", "integer")) {
+      message(
+        paste0(
+          "the column #",
+          as.character(i_col),
+          " of df.output to save: ",
+          "data class is not numeric or integer...fixing it"
+        )
+      )
+      factors <- df.output %>%
+        pull(., var = i_col) %>%
+        factor()
+      df.output[, i_col] <- factors %>%
+        as.numeric()
+      lut_values[[i_col]] <- levels(factors)
+    }
+  }
+
+  if (backend == "hdf5") {
    fn.output.h5 <- hdf5r::H5File$new(fn.output, mode = "a")
+    on.exit(
+      {
+        if (!is.null(fn.output.h5)) {
+          try(fn.output.h5$close_all(), silent = TRUE)
+        }
+      },
+      add = TRUE
+    )
    # open; "a": creates a new file or opens an existing one for read/write

    # check if group "results" already exists!
@@ -1208,7 +1336,6 @@
    if (results.grp$exists(analysis_name) == TRUE &&
      overwrite == FALSE) {
      warning(paste0(analysis_name, " exists but not to overwrite!"))
-      # TODO: add checker for exisiting analysis_name, esp the matrix size
      results.analysis.grp <- results.grp$open(analysis_name)
      results_matrix_ds <- results.analysis.grp[["results_matrix"]]
    } else {
@@ -1217,64 +1344,275 @@
      overwrite == TRUE) {
        # delete existing one first
        results.grp$link_delete(analysis_name)
-        # NOTE: the file size will not shrink after your deletion..
-        # this is because of HDF5, regardless of package of hdf5r or rhdf5
-        # TODO: add a garbage collector after saving the results
      }

      # create:
      results.analysis.grp <- results.grp$create_group(analysis_name)
      # create a subgroup called analysis_name under results.grp

-      # check "df.output": make sure all columns are floats (i.e. numeric)
-      for (i_col in seq(1, ncol(df.output), by = 1)) {
-        # for each column of df.output
-        col_class <- as.character(sapply(df.output, class)[i_col]) # class of this column
-
-        if ((col_class != "numeric") &&
-          (col_class != "integer")) {
-          # the column class is not numeric or integer
-          message(
-            paste0(
-              "the column #",
-              as.character(i_col),
-              " of df.output to save: ",
-              "data class is not numeric or integer...fixing it"
-            )
-          )
+      # write LUTs for non-numeric columns
+      for (i_col in seq_len(ncol(df.output))) {
+        lut <- lut_values[[i_col]]
+        if (!is.null(lut)) {
+          results.analysis.grp[[paste0("lut_forcol", as.character(i_col))]] <- lut
+        }
+      }
+
+      # save:
+      results.analysis.grp[["results_matrix"]] <- as.matrix(df.output)
+
+      # attach column names:
+      hdf5r::h5attr(results.analysis.grp[["results_matrix"]], "colnames") <- colnames(df.output)
+    }
+  } else if (backend == "tiledb") {
+    if (!requireNamespace("tiledb", quietly = TRUE)) {
+      stop("backend='tiledb' requires the tiledb package. Please install it.")
+    }
-          # turn into numeric && write the notes in .h5 file...:
-          factors <- df.output %>%
-            pull(., var = i_col) %>%
-            factor()
-          df.output[, i_col] <- df.output %>%
-            pull(., var = i_col) %>%
-            factor() %>%
-            as.numeric(.) # change into numeric of 1,2,3....
-
-          # write a LUT for this column:
-          results.analysis.grp[[paste0("lut_forcol", as.character(i_col))]] <- levels(factors)
-          # save lut to .h5/results/<analysis_name>/lut_col<i_col>
+    results_uri <- file.path(fn.output, "results")
+    analysis_uri <- file.path(results_uri, analysis_name)
+
+    # tiledb_object_type() returns "INVALID" (rather than erroring) when nothing
+    # exists at a URI, so treat both sentinels as "missing"
+    results_type <- tryCatch(tiledb::tiledb_object_type(results_uri), error = function(e) "NOT_TILEDB")
+    if (results_type %in% c("NOT_TILEDB", "INVALID")) {
+      tiledb::tiledb_group_create(results_uri)
+    }
+
+    analysis_type <- tryCatch(tiledb::tiledb_object_type(analysis_uri), error = function(e) "NOT_TILEDB")
+    if (!analysis_type %in% c("NOT_TILEDB", "INVALID")) {
+      if (overwrite == FALSE) {
+        warning(paste0(analysis_name, " exists but not to overwrite!"))
+        return(invisible(NULL))
+      } else {
+        tiledb::tiledb_object_remove(analysis_uri)
+      }
+    }
+    tiledb::tiledb_group_create(analysis_uri)
+
+    # results matrix array
+    nr <- nrow(df.output)
+    nc <- ncol(df.output)
+    dom <- tiledb::tiledb_domain(list(
+      tiledb::tiledb_dim("row", domain = c(0L, as.integer(nr - 1)), tile = max(1L, min(nr, 1024L)), type = "INT64"),
+      tiledb::tiledb_dim("col", domain = c(0L, as.integer(nc - 1)), tile = max(1L, min(nc, 1024L)), type = "INT64")
+    ))
+    attr <- tiledb::tiledb_attr("values", type = "FLOAT64")
+    schema <- tiledb::tiledb_array_schema(dom, attrs = list(attr), sparse = FALSE)
+    results_matrix_uri <- file.path(analysis_uri, "results_matrix")
+    tiledb::tiledb_array_create(results_matrix_uri, schema)
+    arr <- tiledb::tiledb_array(results_matrix_uri, query_type = "WRITE")
+    arr[] <- as.matrix(df.output)
+
+    # column names array
+    dom_cn <- tiledb::tiledb_domain(list(
+      tiledb::tiledb_dim("idx", domain = c(0L, as.integer(nc - 1)), tile = max(1L, min(nc, 1024L)), type = "INT64")
+    ))
+    attr_cn <- tiledb::tiledb_attr("names", type = "UTF8")
+    schema_cn <- tiledb::tiledb_array_schema(dom_cn, attrs = list(attr_cn), sparse = FALSE)
+    cn_uri <- file.path(analysis_uri, "column_names")
+    tiledb::tiledb_array_create(cn_uri, schema_cn)
+    cn_arr <- tiledb::tiledb_array(cn_uri, query_type = "WRITE")
+    cn_arr[] <- colnames(df.output)
+
+    # LUT arrays for non-numeric columns
+    for (i_col in seq_len(ncol(df.output))) {
+      lut <- lut_values[[i_col]]
+      if (is.null(lut)) next
+      lut_uri <- file.path(analysis_uri, paste0("lut_forcol", as.character(i_col)))
+      dom_lut <- tiledb::tiledb_domain(list(
+        tiledb::tiledb_dim("idx", domain = c(1L, as.integer(length(lut))), tile = max(1L, min(length(lut), 1024L)), type = "INT64")
+      ))
+      attr_lut <- tiledb::tiledb_attr("values", type = "UTF8")
+      schema_lut <- tiledb::tiledb_array_schema(dom_lut, attrs = list(attr_lut), sparse = FALSE)
+      tiledb::tiledb_array_create(lut_uri, schema_lut)
+      lut_arr <- tiledb::tiledb_array(lut_uri, query_type = "WRITE")
+      lut_arr[] <- lut
+    }
+  }
+}
-  # save:
-  results.analysis.grp[["results_matrix"]] <- as.matrix(df.output)
-  # results_matrix_ds <- results.analysis.grp[["results_matrix"]] # name it
-  # attach column names:
-  hdf5r::h5attr(results.analysis.grp[["results_matrix"]], "colnames") <- colnames(df.output)
-  # NOTES: update ConFixel correspondingly


+#' Write phenotypes data.frame to a TileDB group
+#'
+#' @description
+#' Store a mixed-type phenotypes data.frame inside a TileDB group so it can be
+#' colocated with scalar/result arrays.
+#' The data are written as a dense array at `<uri_root>/<group_name>/data`
+#' with one attribute per column, plus sidecar arrays for column classes,
+#' factor levels, and POSIXct time zones.
+#'
+#' @param uri_root TileDB group root to write into (same root as ModelArray scalars/results).
+#' @param phenotypes data.frame to persist; rows should be ordered to match scalar column order.
+#' @param group_name Subgroup name under `uri_root` (default: "phenotypes").
+#' @param overwrite Whether to remove any existing phenotypes group before writing.
+#' @export
+writePhenotypesTileDB <- function(uri_root,
+                                  phenotypes,
+                                  group_name = "phenotypes",
+                                  overwrite = FALSE) {
+  if (!requireNamespace("tiledb", quietly = TRUE)) {
+    stop("backend='tiledb' requires the tiledb package. Please install it.")
+  }
+  if (!is.data.frame(phenotypes)) {
+    stop("phenotypes must be a data.frame")
  }
-  # # return: # will not work if fn.output.h5$close_all()
-  # output_list <- list(results.grp = results.grp,
-  #                     results.analysis.grp = results.analysis.grp,
-  #                     results_matrix_ds = results_matrix_ds)
-  # return(output_list)
+  phen_uri <- file.path(uri_root, group_name)
+  existing_type <- tryCatch(
+    tiledb::tiledb_object_type(phen_uri),
+    error = function(e) "NOT_TILEDB"
+  )
+  if (!existing_type %in% c("NOT_TILEDB", "INVALID")) {
+    if (!overwrite) {
+      stop(paste0("Phenotypes group already exists at ", phen_uri, "; set overwrite=TRUE to replace it."))
+    }
+    tiledb::tiledb_object_remove(phen_uri)
+  }
+  tiledb::tiledb_group_create(phen_uri)
+
+  n_rows <- nrow(phenotypes)
+  n_cols <- ncol(phenotypes)
+  tile_rows <- max(1L, min(n_rows, 1024L))
+  dom <- tiledb::tiledb_domain(list(
+    tiledb::tiledb_dim("row",
+      domain = c(0L, as.integer(max(n_rows - 1, 0))),
+      tile = tile_rows,
+      type = "INT64"
+    )
+  ))
+
+  attrs <- vector("list", n_cols)
+  data_to_write <- list()
+  col_classes <- character(n_cols)
+  col_levels <- character(n_cols)
+  col_tz <- character(n_cols)
+
+  for (i in seq_len(n_cols)) {
+    colname <- colnames(phenotypes)[i]
+    col <- phenotypes[[i]]
+    col_classes[i] <- paste(class(col), collapse = "|")
+    col_levels[i] <- ""
+    col_tz[i] <- ""
+
+    if (inherits(col, "Date")) {
+      attrs[[i]] <- tiledb::tiledb_attr(colname, type = "INT64")
+      data_to_write[[colname]] <- as.integer(col)
+    } else if (inherits(col, "POSIXct")) {
+      attrs[[i]] <- tiledb::tiledb_attr(colname, type = "FLOAT64")
+      data_to_write[[colname]] <- as.numeric(col)
+      tz_val <- attr(col, "tzone")
+      if (!is.null(tz_val)) {
+        col_tz[i] <- paste(tz_val, collapse = "|")
+      }
+    } else if (is.factor(col)) {
+      attrs[[i]] <- tiledb::tiledb_attr(colname, type = "UTF8")
+      data_to_write[[colname]] <- as.character(col)
+      col_levels[i] <- paste(levels(col), collapse = "||")
+    } else if (is.logical(col)) {
+      attrs[[i]] <- tiledb::tiledb_attr(colname, type = "INT8")
+      data_to_write[[colname]] <- as.integer(col)
+    } else if (is.integer(col)) {
+      attrs[[i]] <- tiledb::tiledb_attr(colname, type = "INT64")
+      data_to_write[[colname]] <- as.integer(col)
+    } else if (is.numeric(col)) {
+      attrs[[i]] <- tiledb::tiledb_attr(colname, type = "FLOAT64")
+      data_to_write[[colname]] <- as.numeric(col)
+    } else if (is.character(col)) {
+      attrs[[i]] <- tiledb::tiledb_attr(colname, type = "UTF8")
+      data_to_write[[colname]] <- as.character(col)
+    } else {
+      stop(paste0("Column ", colname, " has unsupported class: ", paste(class(col), collapse = ", ")))
+    }
+  }
-  fn.output.h5$close_all()
+  schema <- tiledb::tiledb_array_schema(dom, attrs = attrs, sparse = FALSE)
+  data_uri <- file.path(phen_uri, "data")
+  tiledb::tiledb_array_create(data_uri, schema)
+  arr <- tiledb::tiledb_array(data_uri, query_type = "WRITE")
+  arr[] <- as.data.frame(data_to_write, optional = TRUE)
+
+  write_string_vector_array <- function(vec, uri) {
+    dom_vec <- tiledb::tiledb_domain(list(
+      tiledb::tiledb_dim("idx",
+        domain = c(0L, as.integer(max(length(vec) - 1, 0))),
+        tile = max(1L, min(length(vec), 1024L)),
+        type = "INT64"
+      )
+    ))
+    attr_vec <- tiledb::tiledb_attr("values", type = "UTF8")
+    schema_vec <- tiledb::tiledb_array_schema(dom_vec, attrs = list(attr_vec), sparse = FALSE)
+    tiledb::tiledb_array_create(uri, schema_vec)
+    arr_vec <- tiledb::tiledb_array(uri, query_type = "WRITE")
+    arr_vec[] <- vec
+  }
+  write_string_vector_array(colnames(phenotypes), file.path(phen_uri, "column_names"))
+  write_string_vector_array(col_classes, file.path(phen_uri, "column_classes"))
+  write_string_vector_array(col_levels, file.path(phen_uri, "column_levels"))
+  write_string_vector_array(col_tz, file.path(phen_uri, "column_tz"))
+
+  invisible(NULL)
+}
+
+
+#' Read phenotypes previously stored in TileDB
+#'
+#' @param uri_root TileDB group root containing phenotypes.
+#' @param group_name Subgroup name where phenotypes were written (default: "phenotypes").
+#' @export
+readPhenotypesTileDB <- function(uri_root, group_name = "phenotypes") {
+  if (!requireNamespace("tiledb", quietly = TRUE)) {
+    stop("backend='tiledb' requires the tiledb package. Please install it.")
+  }
+
+  phen_uri <- file.path(uri_root, group_name)
+  data_uri <- file.path(phen_uri, "data")
+
+  arr <- tiledb::tiledb_array(data_uri, query_type = "READ", return_as = "data.frame")
+  df <- arr[]
+  if ("row" %in% colnames(df)) {
+    df[["row"]] <- NULL
+  }
+
+  read_string_vector_array <- function(uri) {
+    arr_vec <- tiledb::tiledb_array(uri, query_type = "READ", return_as = "data.frame")
+    vals <- arr_vec[]$values
+    vals
+  }
+
+  col_classes <- read_string_vector_array(file.path(phen_uri, "column_classes"))
+  col_levels <- read_string_vector_array(file.path(phen_uri, "column_levels"))
+  col_tz <- read_string_vector_array(file.path(phen_uri, "column_tz"))
+
+  for (i in seq_along(df)) {
+    colname <- colnames(df)[i]
+    classes_str <- col_classes[i]
+    classes <- if (!is.na(classes_str) && nzchar(classes_str)) strsplit(classes_str, "\\|")[[1]] else character(0)
+
+    if ("factor" %in% classes) {
+      levs_str <- col_levels[i]
+      levs <- if (!is.na(levs_str) && nzchar(levs_str)) strsplit(levs_str, "\\|\\|")[[1]] else unique(df[[i]])
+      df[[i]] <- factor(df[[i]], levels = levs)
+    } else if ("logical" %in% classes) {
+      df[[i]] <- as.logical(df[[i]])
+    } else if ("integer" %in% classes) {
+      df[[i]] <- as.integer(df[[i]])
+    } else if ("Date" %in% classes) {
+      df[[i]] <- structure(as.integer(df[[i]]), class = "Date")
+    } else if ("POSIXct" %in% classes) {
+      tz <- col_tz[i]
+      tz <- if (!is.na(tz) && nzchar(tz)) strsplit(tz, "\\|")[[1]][1] else "UTC"
+      df[[i]] <- as.POSIXct(df[[i]], origin = "1970-01-01", tz = tz)
+    } else if ("character" %in% classes) {
+      df[[i]] <- as.character(df[[i]])
+    } else {
+      # default: keep as-is (numeric)
+      df[[i]] <- as.numeric(df[[i]])
+    }
+
+    # Restore ordered factor if present in class string
+    if ("ordered" %in% classes && inherits(df[[i]], "factor")) {
+      df[[i]] <- factor(df[[i]], levels = levels(df[[i]]), ordered = TRUE)
+    }
+  }
-
-  # message("Results file written!")
+  df
}
diff --git a/R/ModelArray_S4Methods.R b/R/ModelArray_S4Methods.R
index 5292069..f817237 100644
--- a/R/ModelArray_S4Methods.R
+++ b/R/ModelArray_S4Methods.R
@@ -137,10 +137,10 @@ setGeneric("exampleElementData", function(x, ...) standardGeneric("exampleElemen
#' per-element data that `ModelArray.wrap` passes to user functions (`data = dat`).
#'
#' @param x An ModelArray object
-#' @param scalar A character. The name of the element-wise scalar to append
+#' @param scalar A character vector. One or more element-wise scalar names to append
#' @param i_element An integer, the i_th element (1-based)
#' @param phenotypes A data.frame of the cohort with independent variables/covariates
-#' @return A data.frame with the additional response column named by `scalar`
+#' @return A data.frame with additional response column(s) named by `scalar`
#' @examples
#' \dontrun{
#' h5_path <- system.file("extdata", "n50_fixels.h5", package = "ModelArray")
@@ -157,16 +157,58 @@ setMethod(
    if (!is.data.frame(phenotypes)) {
      stop("phenotypes must be a data.frame")
    }
-    if (!(scalar %in% names(scalars(x)))) {
-      stop("scalar not found in modelarray; use one of names(scalars(x))")
+    if (length(scalar) < 1L) {
+      stop("scalar must contain at least one scalar name")
    }
-    num_elements <- nrow(scalars(x)[[scalar]])
+    missing_scalars <- setdiff(scalar, names(scalars(x)))
+    if (length(missing_scalars) > 0) {
+      stop(paste0(
+        "scalar not found in modelarray: ",
+        paste(missing_scalars, collapse = ", "),
+        "; use one of names(scalars(x))"
+      ))
+    }
+
+    num_elements <- nrow(scalars(x)[[scalar[[1]]]])
    if (length(i_element) != 1L || is.na(i_element) || i_element < 1L || i_element > num_elements) {
      stop("i_element is out of range")
    }
-    dat <- phenotypes
-    dat[[scalar]] <- scalars(x)[[scalar]][i_element, ]
+    # If phenotypes is in long format (one row per scalar), subset to a single scalar
+    if ("scalar_name" %in% colnames(phenotypes) && length(unique(phenotypes$scalar_name)) > 1) {
+      dat <- subset(phenotypes, scalar_name == scalar[[1]])
+    } else {
+      dat <- phenotypes
+    }
+
+    # Align / validate source ordering against the ModelArray sources
+    src_modelarray <- sources(x)[[scalar[[1]]]]
+    src_pheno <- dat[["source_file"]]
+    if (is.null(src_pheno)) {
+      stop("phenotypes must contain a 'source_file' column for alignment")
+    }
+    if (length(src_modelarray) != length(src_pheno)) {
+      stop("Length of phenotypes$source_file does not match ModelArray sources")
+    }
+    if (length(src_modelarray) != length(unique(src_modelarray))) {
+      stop("ModelArray sources are not unique; cannot align phenotypes")
+    }
+    if (length(src_pheno) != length(unique(src_pheno))) {
+      stop("phenotypes$source_file entries are not unique; cannot align")
+    }
+    if (!identical(src_modelarray, src_pheno)) {
+      if ((all(src_modelarray %in% src_pheno)) && (all(src_pheno %in% src_modelarray))) {
+        reorder_idx <- match(src_modelarray, src_pheno)
+        dat <- dat[reorder_idx, , drop = FALSE]
+        row.names(dat) <- NULL
+      } else {
+        stop("phenotypes$source_file entries differ from ModelArray sources; cannot align")
+      }
+    }
+
+    for (s in scalar) {
+      dat[[s]] <- scalars(x)[[s]][i_element, ]
    }

    dat
  }
)
diff --git a/R/utils.R b/R/utils.R
index 6a3c085..0a15349 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -7,6 +7,9 @@
#' @importFrom dplyr filter
#' @importFrom rlang .data
flagObjectExistInh5 <- function(fn_h5, group_name = "/results", object_name = "myAnalysis") {
+  if (!requireNamespace("rhdf5", quietly = TRUE)) {
+    stop("rhdf5 is required for HDF5 operations but is not installed.")
+  }
  rhdf5::h5closeAll()

  h5 <- rhdf5::h5ls(fn_h5)
@@ -33,6 +36,9 @@ flagObjectExistInh5 <- function(fn_h5, group_name = "/results", object_name = "m
#' @importFrom dplyr filter
#' @importFrom rlang .data
flagResultsGroupExistInh5 <- function(fn_h5) {
+  if (!requireNamespace("rhdf5", quietly = TRUE)) {
+    stop("rhdf5 is required for HDF5 operations but is not installed.")
+  }
  rhdf5::h5closeAll()

  h5 <- rhdf5::h5ls(fn_h5)
@@ -57,6 +63,9 @@ flagResultsGroupExistInh5 <- function(fn_h5) {
#' @importFrom dplyr filter
#' @importFrom rlang .data
flagAnalysisExistInh5 <- function(fn_h5, analysis_name) {
+  if (!requireNamespace("rhdf5", quietly = TRUE)) {
+    stop("rhdf5 is required for HDF5 operations but is not installed.")
+  }
  rhdf5::h5closeAll()

  h5 <- rhdf5::h5ls(fn_h5)
@@ -74,6 +83,61 @@ flagAnalysisExistInh5 <- function(fn_h5, analysis_name)
}


+#' check if TileDB results group exists at the root URI
+#' @param uri_root TileDB group root URI
+#' @noRd
+flagResultsGroupExistInTileDB <- function(uri_root) {
+  if (!requireNamespace("tiledb", quietly = TRUE)) {
+    stop("tiledb is required for TileDB operations but is not installed.")
+  }
+
+  type <- tryCatch(
+    tiledb::tiledb_object_type(file.path(uri_root, "results")),
+    error = function(e) NA_character_
+  )
+
+  identical(type, "GROUP")
+}
+
+
+#' check if a TileDB analysis group exists under results
+#' @param uri_root TileDB group root URI
+#' @param analysis_name Name of the analysis group
+#' @noRd
+flagAnalysisExistInTileDB <- function(uri_root, analysis_name) {
+  if (!requireNamespace("tiledb", quietly = TRUE)) {
+    stop("tiledb is required for TileDB operations but is not installed.")
+  }
+
+  type <- tryCatch(
+    tiledb::tiledb_object_type(file.path(uri_root, "results", analysis_name)),
+    error = function(e) NA_character_
+  )
+
+  identical(type, "GROUP")
+}
+
+
+#' check if a TileDB array/object exists under a group
+#' @param uri_root TileDB group root URI
+#' @param group_path Relative path from root to the group containing the object
+#' @param object_name Object (array) name
+#' @noRd
+flagObjectExistInTileDB <- function(uri_root, group_path, object_name) {
+  if (!requireNamespace("tiledb", quietly = TRUE)) {
+    stop("tiledb is required for TileDB operations but is not installed.")
+  }
+
+  uri <- file.path(uri_root, group_path, object_name)
+  type <- tryCatch(
+    tiledb::tiledb_object_type(uri),
+    error = function(e) NA_character_
+  )
+
+  identical(type, "ARRAY")
+}
+
+
#' print the additional arguments settings
#' @param FUN The function, e.g. mgcv::gam, without "()"
#' @param argu_name The argument name of the function
diff --git a/vignettes/voxel-wise_data.Rmd b/vignettes/voxel-wise_data.Rmd
index 1896791..6442a4b 100644
--- a/vignettes/voxel-wise_data.Rmd
+++ b/vignettes/voxel-wise_data.Rmd
@@ -7,7 +7,11 @@ vignette: >
  %\VignetteEncoding{UTF-8}
---

-`ModelArray` is compatible with not only fixel-wise data, but also voxel-wise data. The `vignette("walkthrough")` page has shown example walkthrough with example fixel-wise data, with hints for voxel-wise data provided throughout the walkthrough. In general, the workflow of applying `ModelArray` to voxel-wise data is consistent to that of fixel-wise data, but please note several important differences when execution described as below.
+`ModelArray` is compatible with not only fixel-wise data, but also voxel-wise data.
+The `vignette("walkthrough")` page presents an example walkthrough with fixel-wise data,
+with hints for voxel-wise data provided throughout.
+In general, the workflow of applying `ModelArray` to voxel-wise data is consistent with that for fixel-wise data,
+but please note several important differences in execution, described below.

## Conversion of voxel-wise data format

When converting voxel-wise data between NIfTI and HDF5 file format that `ModelArray` requires, you will still use `ConFixel` software, but a different converter tailored for voxel-wise data, `ConVoxel`. We have provided a walkthrough of how to use it [here](https://github.com/PennLINC/ConFixel/blob/main/notebooks/walkthrough_voxel-wise_data.md).
diff --git a/vignettes/walkthrough.Rmd b/vignettes/walkthrough.Rmd
index 1304650..946303d 100644
--- a/vignettes/walkthrough.Rmd
+++ b/vignettes/walkthrough.Rmd
@@ -464,3 +464,27 @@ You may change the view by clicking "View" in the upper panel. Below is an examp

![An example view](mrview_results_lm_demo_FDC_n100.PNG)
+
+## Optional: store phenotypes inside TileDB
+
+If you are using the TileDB backend, you can package the cohort phenotypes alongside the scalar/results arrays. This keeps everything self-contained and avoids shipping a separate CSV. We provide two helpers:
+
+```r
+# write phenotypes into <uri_root>/phenotypes (dense array, one attr per column)
+writePhenotypesTileDB(
+  uri_root = "/path/to/tiledb_root",
+  phenotypes = phen_df, # data.frame; rows must align to scalars' column_names order
+  overwrite = FALSE
+)
+
+# read them back later
+phen_from_tdb <- readPhenotypesTileDB(
+  uri_root = "/path/to/tiledb_root"
+)
+```
+
+Notes:
+
+- The row order must match the subject order used in `ModelArray()` (i.e., the same order as `scalars/<scalar_type>/column_names`). If your CSV differs, reorder it first.
+- Mixed datatypes are supported (numeric, integer, logical, character, factor, Date, POSIXct). Factors are stored as strings with their original levels preserved; Dates are stored as days since epoch; POSIXct as seconds (time zone preserved when available); a short round trip demonstrating this follows below.
+- Data are written under `phenotypes/` with sidecar arrays for column classes/levels/time zones. Overwrite only when you intend to replace existing phenotypes.
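+
+A quick round trip illustrates the type handling described in the notes above. This is a minimal sketch: `phen_df` and the TileDB root path are hypothetical placeholders, not files shipped with the package.
+
+```r
+phen_df <- data.frame(
+  source_file = c("sub-01_FD.mif", "sub-02_FD.mif"),
+  age = c(21.5, 34.0),
+  sex = factor(c("F", "M")),
+  scan_date = as.Date(c("2020-01-01", "2020-02-15"))
+)
+writePhenotypesTileDB("/path/to/tiledb_root", phen_df, overwrite = TRUE)
+phen_back <- readPhenotypesTileDB("/path/to/tiledb_root")
+str(phen_back) # sex is a factor again; scan_date is a Date again
+```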