More refactoring to use dtype class

keller-mark · Aug 27, 2023 · 988a76d · 988a76d
1 parent 987fc33
commit 988a76d
Show file tree

Hide file tree

Showing 8 changed files with 148 additions and 115 deletions.
diff --git a/R/array-nested.R b/R/array-nested.R
@@ -53,6 +53,8 @@ NestedArray <- R6::R6Class("NestedArray",
     shape = NULL,
     #' @field dtype The Zarr dtype of the array, as a string like ">f8".
     dtype = NULL,
+    #' @field dtype_obj The Zarr dtype of the array, as a Dtype instance.
+    dtype_obj = NULL,
     #' @field data The array contents as a base R array.
     data = NULL,
     #' @description
@@ -75,25 +77,27 @@ NestedArray <- R6::R6Class("NestedArray",
         shape <- normalize_shape(shape)
       }
       if(is_na(dtype) && (is.numeric(data) || is.logical(data))) {
-        dtype <- get_dtype_from_array(data)
+        self$dtype_obj <- Dtype$new(get_dtype_from_array(data))
+      } else if("Dtype" %in% class(dtype)) {
+        self$dtype_obj <- dtype
+      } else if(is.character(dtype)) {
+        self$dtype_obj <- Dtype$new(dtype)
+        if(self$dtype_obj$is_object) {
+          stop("Object dtype was initialized from string in NestedArray, so object_codec is missing.")
+        }
       } else {
-        dtype <- normalize_dtype(dtype)
+        stop("dtype must be NA, string/character vector, or Dtype instance")
       }
       self$shape <- shape
-      self$dtype <- dtype
-
-      dtype_parts <- get_dtype_parts(dtype)
-      private$dtype_basic_type <- dtype_parts$basic_type
-      private$dtype_byte_order <- dtype_parts$byte_order
-      private$dtype_num_bytes <- dtype_parts$num_bytes
-      private$dtype_num_items <- dtype_parts$num_items
 
       private$is_zero_dim <- (is.null(shape) || length(shape) == 0)
 
       if(is.null(data)) {
         # Create empty array.
 
-        self$data <- array(data=get_dtype_rtype(dtype), dim=shape)
+        dtype_rtype <- self$dtype_obj$get_rtype()
+
+        self$data <- array(data=dtype_rtype, dim=shape)
       } else if(!is.raw(data) && is.null(self$shape)) {
         # Create zero-dimensional array.
 
@@ -107,7 +111,7 @@ NestedArray <- R6::R6Class("NestedArray",
           self$data <- data
         } else {
           # Data array did not have the expected shape, so we need to reshape it.
-          astype_func <- get_dtype_asrtype(dtype)
+          astype_func <- self$dtype_obj$get_asrtype()
           self$data <- array(data=as.array(astype_func(data)), dim=shape)
         }
       } else if(is.raw(data)) {
@@ -119,30 +123,30 @@ NestedArray <- R6::R6Class("NestedArray",
         buf <- data
         # Create from ArrayBuffer or Buffer
 
-        dtype_size <- private$dtype_num_bytes
+        dtype_size <- self$dtype_obj$num_bytes
         num_data_elements <- length(buf) / dtype_size
         if (num_shape_elements != num_data_elements) {
           stop('Buffer has ${numDataElements} of dtype ${dtype}, shape is too large or small')
         }
 
-        dtype_rtype <- get_dtype_rtype(dtype)
-        dtype_signed <- get_dtype_signed(dtype)
+        dtype_rtype <- self$dtype_obj$get_rtype()
+        dtype_signed <- self$dtype_obj$is_signed
         if(!dtype_signed && !(dtype_size == 1 || dtype_size == 2)) {
           # readBin will warn "signed = FALSE is only valid for integers of sizes 1 and 2"
           dtype_signed <- TRUE
         }
 
-        endian <- get_dtype_endianness(self$dtype)
+        endian <- self$dtype_obj$byte_order
         # Normalize to only "little" or "big" since this is what writeBin accepts.
         if(endian == "nr") {
           endian <- "little"
         }
 
-        if(private$dtype_basic_type %in% c("S", "U")) {
+        if(self$dtype_obj$basic_type %in% c("S", "U")) {
           vec_from_raw <- raw_to_char_vec(
             buf,
-            private$dtype_basic_type,
-            private$dtype_num_items,
+            self$dtype_obj$basic_type,
+            self$dtype_obj$num_items,
             endian
           )
         } else {
@@ -175,11 +179,12 @@ NestedArray <- R6::R6Class("NestedArray",
         self$data <- array_from_vec
       } else if(is_scalar(data)) {
         # Create array from a scalar value.
-        astype_func <- get_dtype_asrtype(dtype)
+        astype_func <- self$dtype_obj$get_asrtype()
+        dtype_rtype <- self$dtype_obj$get_rtype()
         if(private$is_zero_dim) {
-          self$data <- array(data=get_dtype_rtype(dtype), dim=c(1))
+          self$data <- array(data=dtype_rtype, dim=c(1))
         } else {
-          self$data <- array(data=get_dtype_rtype(dtype), dim=shape)
+          self$data <- array(data=dtype_rtype, dim=shape)
         }
         self$data[] <- astype_func(data)
       } else {
@@ -199,7 +204,7 @@ NestedArray <- R6::R6Class("NestedArray",
       # Using do.call here seems to work the same as `abind::asub(self$data, selection_list)`
       # so we can use do.call to avoid the extra dependency.
       subset_arr <- do.call("[", append(list(self$data), selection_list))
-      subset_nested_array <- NestedArray$new(subset_arr, shape = dim(subset_arr), dtype = self$dtype)
+      subset_nested_array <- NestedArray$new(subset_arr, shape = dim(subset_arr), dtype = self$dtype_obj)
       return(subset_nested_array)
     },
     #' @description
@@ -265,26 +270,26 @@ NestedArray <- R6::R6Class("NestedArray",
     flatten_to_raw = function(order = NA) {
       data_as_vec <- self$flatten(order = order)
 
-      endian <- get_dtype_endianness(self$dtype)
+      endian <- self$dtype_obj$byte_order
       # Normalize to only "little" or "big" since this is what writeBin accepts.
       if(endian == "nr") {
         endian <- "little"
       }
 
       # "If writeBin is called with con a raw vector, it is just an indication that a raw vector should be returned."
       # Reference: https://stat.ethz.ch/R-manual/R-devel/library/base/html/readBin.html
-      if(private$dtype_basic_type %in% c("S", "U")) {
+      if(self$dtype_obj$basic_type %in% c("S", "U")) {
         buf <- char_vec_to_raw(
           data_as_vec,
-          private$dtype_basic_type,
-          private$dtype_num_items,
+          self$dtype_obj$basic_type,
+          self$dtype_obj$num_items,
           endian
         )
       } else {
         buf <- writeBin(
           data_as_vec,
           con = raw(),
-          size = private$dtype_num_bytes,
+          size = self$dtype_obj$num_bytes,
           endian = endian
         )
       }

diff --git a/R/creation.R b/R/creation.R
@@ -82,7 +82,7 @@ init_array_metadata <- function(
     }
 
     # normalize metadata
-    dtype <- normalize_dtype(dtype)
+    dtype <- normalize_dtype(dtype, object_codec = object_codec)
 
     # object_codec <- normalize_object_codec(dtype, object_codec) # TODO
 
@@ -91,7 +91,7 @@ init_array_metadata <- function(
 
     shape <- normalize_shape(shape)
 
-    dtype_itemsize <- get_dtype_numbytes(dtype)
+    dtype_itemsize <- dtype$num_bytes
     chunks <- normalize_chunks(chunks, shape, dtype_itemsize)
     order <- normalize_order(order)
     fill_value <- normalize_fill_value(fill_value, dtype)
@@ -130,21 +130,23 @@ init_array_metadata <- function(
         }
     }
 
-    # TODO: deal with object encoding
-    # if dtype.hasobject:
-    #     if object_codec is None:
-    #         if not filters:
-    #             # there are no filters so we can be sure there is no object codec
-    #             raise ValueError('missing object_codec for object array')
-    #         else:
-    #             # one of the filters may be an object codec, issue a warning rather
-    #             # than raise an error to maintain backwards-compatibility
-    #             warnings.warn('missing object_codec for object array; this will raise a '
-    #                           'ValueError in version 3.0', FutureWarning)
-    #     else:
-    #         filters_config.insert(0, object_codec.get_config())
-    # elif object_codec is not None:
-    #     warnings.warn('an object_codec is only needed for object arrays')
+    # Check object codec
+    if(dtype$is_object) {
+        if(is_na(object_codec)) {
+            if(length(filters_config) == 0) {
+                # there are no filters so we can be sure there is no object codec
+                stop("missing object_codec for object array")
+            } else {
+                # one of the filters may be an object codec, issue a warning rather
+                # than raise an error to maintain backwards-compatibility
+                stop("missing object_codec for object array")
+            }
+        } else {
+            filters_config <- append(filters_config, object_codec$get_config())
+        }
+    } else if(!is_na(object_codec)) {
+        warning("an object_codec is only needed for object arrays")
+    }
 
     # use null to indicate no filters
     if (length(filters_config) == 0) {

diff --git a/R/dtypes.R b/R/dtypes.R
@@ -1,12 +1,26 @@
 # Internal utility functions for converting between Zarr and R data types.
 
+#' @keywords internal
+get_dtype_from_array <- function(a) {
+  RTYPE_DTYPE_MAPPING <- list(
+    "logical" = "|b1",
+    "integer" = "<i4",
+    "double" = "<f8",
+    "character" = "|S8" # TODO: how many bytes to use here?
+  )
+  rtype_str <- typeof(a)
+  return(RTYPE_DTYPE_MAPPING[[rtype_str]])
+}
+
+#' @keywords internal
 is_structured_dtype <- function(dtype) {
   if(is.character(dtype) && length(dtype) == 1) {
     return(FALSE)
   }
   return(TRUE)
 }
 
+#' @keywords internal
 get_dtype_parts <- function(dtype) {
   # TODO: support object dtype (without digits required in regex)
   dtype_regex <- "^(\\||>|<)(b|i|u|f|c|m|M|S|U|V|O)(\\d+)"
@@ -32,6 +46,7 @@ get_dtype_parts <- function(dtype) {
   }
 }
 
+#' @keywords internal
 check_dtype_support <- function(dtype_parts) {
   if(!is_na(dtype_parts) && dtype_parts$basic_type %in% c("b", "i", "u", "f", "S", "U")) {
     return(TRUE)
@@ -40,12 +55,9 @@ check_dtype_support <- function(dtype_parts) {
   return(FALSE)
 }
 
-
-
 #' @keywords internal
 get_dtype_rtype <- function(dtype) {
   dtype_parts <- get_dtype_parts(dtype)
-  check_dtype_support(dtype_parts)
 
   # Reference: https://github.com/gzuidhof/zarr.js/blob/292804/src/nestedArray/types.ts#L32
   BASICTYPE_RTYPE_MAPPING <- list(
@@ -63,7 +75,6 @@ get_dtype_rtype <- function(dtype) {
 #' @keywords internal
 get_dtype_endianness <- function(dtype) {
   dtype_parts <- get_dtype_parts(dtype)
-  check_dtype_support(dtype_parts)
 
   DTYPE_ENDIANNESS_MAPPING <- list(
     "|" = "nr",
@@ -76,14 +87,12 @@ get_dtype_endianness <- function(dtype) {
 #' @keywords internal
 get_dtype_numbytes <- function(dtype) {
   dtype_parts <- get_dtype_parts(dtype)
-  check_dtype_support(dtype_parts)
   return(dtype_parts$num_bytes)
 }
 
 #' @keywords internal
 get_dtype_signed <- function(dtype) {
   dtype_parts <- get_dtype_parts(dtype)
-  check_dtype_support(dtype_parts)
 
   DTYPE_SIGNED_MAPPING <- list(
     "b" = FALSE,
@@ -96,18 +105,6 @@ get_dtype_signed <- function(dtype) {
   return(DTYPE_SIGNED_MAPPING[[dtype_parts$basic_type]])
 }
 
-#' @keywords internal
-get_dtype_from_array <- function(a) {
-  RTYPE_DTYPE_MAPPING <- list(
-    "logical" = "|b1",
-    "integer" = "<i4",
-    "double" = "<f8",
-    "character" = "|S8" # TODO: how many bytes to use here?
-  )
-  rtype_str <- typeof(a)
-  return(RTYPE_DTYPE_MAPPING[[rtype_str]])
-}
-
 #' @keywords internal
 get_dtype_asrtype <- function(dtype) {
   dtype_parts <- get_dtype_parts(dtype)
@@ -131,6 +128,8 @@ get_typed_array_ctr <- function(dtype) {
   return(function(dim) array(data = rtype, dim = dim))
 }
 
+# Reference: https://numpy.org/doc/stable/reference/arrays.dtypes.html
+
 #' The Zarr Dtype class.
 #' @title Dtype Class
 #' @docType class
@@ -140,28 +139,46 @@ get_typed_array_ctr <- function(dtype) {
 #' @keywords internal
 Dtype <- R6::R6Class("Dtype",
   public = list(
+    #' @field dtype The original dtype string, like "<f4".
     dtype = NULL,
+    #' @field byte_order The byte order of the dtype, either "little", "big", or "nr".
     byte_order = NULL,
+    #' @field basic_type The basic type of the dtype, like "f".
     basic_type = NULL,
+    #' @field num_bytes The number of bytes of the dtype.
     num_bytes = NULL,
+    #' @field num_items The number of items of the dtype.
     num_items = NULL,
+    #' @field is_signed Whether the dtype is signed. Logical/boolean.
+    is_signed = NULL,
+    #' @field is_structured Whether the dtype is structured. Logical/boolean.
+    is_structured = NULL,
+    #' @field is_object Whether the dtype is an object. Logical/boolean.
+    is_object = NULL,
+    #' @field object_codec The object codec instance.
+    object_codec = NULL,
     #' @description
     #' Create a new Dtype instance.
+    #' @param dtype The original dtype string, like "<f4".
+    #' @param object_codec The object codec instance.
     #' @return A `Dtype` instance.
-    initialize = function(dtype, object_codec) {
+    initialize = function(dtype, object_codec = NA) {
       self$dtype <- dtype
 
-      # TODO: support dtype_str == "|O" for object dtypes
+      # TODO: support dtype_str == "|O" for object dtypes / dont require numeric part of dtype string
 
       dtype_parts <- get_dtype_parts(dtype)
       check_dtype_support(dtype_parts)
-      self$byte_order <- dtype_parts$byte_order
+      self$byte_order <-  get_dtype_endianness(dtype)
       self$basic_type <- dtype_parts$basic_type
       self$num_bytes <- dtype_parts$num_bytes
       self$num_items <- dtype_parts$num_items
 
+      self$is_signed <- get_dtype_signed(dtype)
+      self$is_structured <- is_structured_dtype(dtype)
+      self$is_object <- (self$basic_type == "O")
+
       # TODO: port code from normalize_dtype in zarr-python
-
       self$object_codec <- object_codec
     },
     get_asrtype = function() {
@@ -170,26 +187,9 @@ Dtype <- R6::R6Class("Dtype",
     get_rtype = function() {
       return(get_dtype_rtype(self$dtype))
     },
-    is_signed = function() {
-      return(get_dtype_signed(self$dtype))
-    },
-    is_structured = function() {
-      return(is_structured_dtype(self$dtype))
-    },
-    is_object = function() {
-      return(self$basic_type == "O")
-    },
-    get_byte_order = function() {
-      return(self$byte_order)
-    },
-    get_basic_type = function() {
-      return(self$basic_type)
-    },
-    get_num_bytes = function() {
-      return(self$num_bytes)
-    },
-    get_num_items = function() {
-      return(self$num_items)
+    get_typed_array_ctr = function() {
+      rtype <- self$get_rtype()
+      return(function(dim) array(data = rtype, dim = dim))
     }
   )
 )