Skip to content

Commit

Permalink
More refactoring to use dtype class
Browse files Browse the repository at this point in the history
  • Loading branch information
keller-mark committed Aug 27, 2023
1 parent 987fc33 commit 988a76d
Show file tree
Hide file tree
Showing 8 changed files with 148 additions and 115 deletions.
59 changes: 32 additions & 27 deletions R/array-nested.R
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ NestedArray <- R6::R6Class("NestedArray",
shape = NULL,
#' @field dtype The Zarr dtype of the array, as a string like ">f8".
dtype = NULL,
#' @field dtype_obj The Zarr dtype of the array, as a Dtype instance.
dtype_obj = NULL,
#' @field data The array contents as a base R array.
data = NULL,
#' @description
Expand All @@ -75,25 +77,27 @@ NestedArray <- R6::R6Class("NestedArray",
shape <- normalize_shape(shape)
}
if(is_na(dtype) && (is.numeric(data) || is.logical(data))) {
dtype <- get_dtype_from_array(data)
self$dtype_obj <- Dtype$new(get_dtype_from_array(data))
} else if("Dtype" %in% class(dtype)) {
self$dtype_obj <- dtype
} else if(is.character(dtype)) {
self$dtype_obj <- Dtype$new(dtype)
if(self$dtype_obj$is_object) {
stop("Object dtype was initialized from string in NestedArray, so object_codec is missing.")
}
} else {
dtype <- normalize_dtype(dtype)
stop("dtype must be NA, string/character vector, or Dtype instance")
}
self$shape <- shape
self$dtype <- dtype

dtype_parts <- get_dtype_parts(dtype)
private$dtype_basic_type <- dtype_parts$basic_type
private$dtype_byte_order <- dtype_parts$byte_order
private$dtype_num_bytes <- dtype_parts$num_bytes
private$dtype_num_items <- dtype_parts$num_items

private$is_zero_dim <- (is.null(shape) || length(shape) == 0)

if(is.null(data)) {
# Create empty array.

self$data <- array(data=get_dtype_rtype(dtype), dim=shape)
dtype_rtype <- self$dtype_obj$get_rtype()

self$data <- array(data=dtype_rtype, dim=shape)
} else if(!is.raw(data) && is.null(self$shape)) {
# Create zero-dimensional array.

Expand All @@ -107,7 +111,7 @@ NestedArray <- R6::R6Class("NestedArray",
self$data <- data
} else {
# Data array did not have the expected shape, so we need to reshape it.
astype_func <- get_dtype_asrtype(dtype)
astype_func <- self$dtype_obj$get_asrtype()
self$data <- array(data=as.array(astype_func(data)), dim=shape)
}
} else if(is.raw(data)) {
Expand All @@ -119,30 +123,30 @@ NestedArray <- R6::R6Class("NestedArray",
buf <- data
# Create from ArrayBuffer or Buffer

dtype_size <- private$dtype_num_bytes
dtype_size <- self$dtype_obj$num_bytes
num_data_elements <- length(buf) / dtype_size
if (num_shape_elements != num_data_elements) {
stop('Buffer has ${numDataElements} of dtype ${dtype}, shape is too large or small')
}

dtype_rtype <- get_dtype_rtype(dtype)
dtype_signed <- get_dtype_signed(dtype)
dtype_rtype <- self$dtype_obj$get_rtype()
dtype_signed <- self$dtype_obj$is_signed
if(!dtype_signed && !(dtype_size == 1 || dtype_size == 2)) {
# readBin will warn "signed = FALSE is only valid for integers of sizes 1 and 2"
dtype_signed <- TRUE
}

endian <- get_dtype_endianness(self$dtype)
endian <- self$dtype_obj$byte_order
# Normalize to only "little" or "big" since this is what writeBin accepts.
if(endian == "nr") {
endian <- "little"
}

if(private$dtype_basic_type %in% c("S", "U")) {
if(self$dtype_obj$basic_type %in% c("S", "U")) {
vec_from_raw <- raw_to_char_vec(
buf,
private$dtype_basic_type,
private$dtype_num_items,
self$dtype_obj$basic_type,
self$dtype_obj$num_items,
endian
)
} else {
Expand Down Expand Up @@ -175,11 +179,12 @@ NestedArray <- R6::R6Class("NestedArray",
self$data <- array_from_vec
} else if(is_scalar(data)) {
# Create array from a scalar value.
astype_func <- get_dtype_asrtype(dtype)
astype_func <- self$dtype_obj$get_asrtype()
dtype_rtype <- self$dtype_obj$get_rtype()
if(private$is_zero_dim) {
self$data <- array(data=get_dtype_rtype(dtype), dim=c(1))
self$data <- array(data=dtype_rtype, dim=c(1))
} else {
self$data <- array(data=get_dtype_rtype(dtype), dim=shape)
self$data <- array(data=dtype_rtype, dim=shape)
}
self$data[] <- astype_func(data)
} else {
Expand All @@ -199,7 +204,7 @@ NestedArray <- R6::R6Class("NestedArray",
# Using do.call here seems to work the same as `abind::asub(self$data, selection_list)`
# so we can use do.call to avoid the extra dependency.
subset_arr <- do.call("[", append(list(self$data), selection_list))
subset_nested_array <- NestedArray$new(subset_arr, shape = dim(subset_arr), dtype = self$dtype)
subset_nested_array <- NestedArray$new(subset_arr, shape = dim(subset_arr), dtype = self$dtype_obj)
return(subset_nested_array)
},
#' @description
Expand Down Expand Up @@ -265,26 +270,26 @@ NestedArray <- R6::R6Class("NestedArray",
flatten_to_raw = function(order = NA) {
data_as_vec <- self$flatten(order = order)

endian <- get_dtype_endianness(self$dtype)
endian <- self$dtype_obj$byte_order
# Normalize to only "little" or "big" since this is what writeBin accepts.
if(endian == "nr") {
endian <- "little"
}

# "If writeBin is called with con a raw vector, it is just an indication that a raw vector should be returned."
# Reference: https://stat.ethz.ch/R-manual/R-devel/library/base/html/readBin.html
if(private$dtype_basic_type %in% c("S", "U")) {
if(self$dtype_obj$basic_type %in% c("S", "U")) {
buf <- char_vec_to_raw(
data_as_vec,
private$dtype_basic_type,
private$dtype_num_items,
self$dtype_obj$basic_type,
self$dtype_obj$num_items,
endian
)
} else {
buf <- writeBin(
data_as_vec,
con = raw(),
size = private$dtype_num_bytes,
size = self$dtype_obj$num_bytes,
endian = endian
)
}
Expand Down
36 changes: 19 additions & 17 deletions R/creation.R
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ init_array_metadata <- function(
}

# normalize metadata
dtype <- normalize_dtype(dtype)
dtype <- normalize_dtype(dtype, object_codec = object_codec)

# object_codec <- normalize_object_codec(dtype, object_codec) # TODO

Expand All @@ -91,7 +91,7 @@ init_array_metadata <- function(

shape <- normalize_shape(shape)

dtype_itemsize <- get_dtype_numbytes(dtype)
dtype_itemsize <- dtype$num_bytes
chunks <- normalize_chunks(chunks, shape, dtype_itemsize)
order <- normalize_order(order)
fill_value <- normalize_fill_value(fill_value, dtype)
Expand Down Expand Up @@ -130,21 +130,23 @@ init_array_metadata <- function(
}
}

# TODO: deal with object encoding
# if dtype.hasobject:
# if object_codec is None:
# if not filters:
# # there are no filters so we can be sure there is no object codec
# raise ValueError('missing object_codec for object array')
# else:
# # one of the filters may be an object codec, issue a warning rather
# # than raise an error to maintain backwards-compatibility
# warnings.warn('missing object_codec for object array; this will raise a '
# 'ValueError in version 3.0', FutureWarning)
# else:
# filters_config.insert(0, object_codec.get_config())
# elif object_codec is not None:
# warnings.warn('an object_codec is only needed for object arrays')
# Check object codec
if(dtype$is_object) {
if(is_na(object_codec)) {
if(length(filters_config) == 0) {
# there are no filters so we can be sure there is no object codec
stop("missing object_codec for object array")
} else {
# one of the filters may be an object codec, issue a warning rather
# than raise an error to maintain backwards-compatibility
stop("missing object_codec for object array")
}
} else {
filters_config <- append(filters_config, object_codec$get_config())
}
} else if(!is_na(object_codec)) {
warning("an object_codec is only needed for object arrays")
}

# use null to indicate no filters
if (length(filters_config) == 0) {
Expand Down
84 changes: 42 additions & 42 deletions R/dtypes.R
Original file line number Diff line number Diff line change
@@ -1,12 +1,26 @@
# Internal utility functions for converting between Zarr and R data types.

#' @keywords internal
get_dtype_from_array <- function(a) {
RTYPE_DTYPE_MAPPING <- list(
"logical" = "|b1",
"integer" = "<i4",
"double" = "<f8",
"character" = "|S8" # TODO: how many bytes to use here?
)
rtype_str <- typeof(a)
return(RTYPE_DTYPE_MAPPING[[rtype_str]])
}

#' @keywords internal
is_structured_dtype <- function(dtype) {
if(is.character(dtype) && length(dtype) == 1) {
return(FALSE)
}
return(TRUE)
}

#' @keywords internal
get_dtype_parts <- function(dtype) {
# TODO: support object dtype (without digits required in regex)
dtype_regex <- "^(\\||>|<)(b|i|u|f|c|m|M|S|U|V|O)(\\d+)"
Expand All @@ -32,6 +46,7 @@ get_dtype_parts <- function(dtype) {
}
}

#' @keywords internal
check_dtype_support <- function(dtype_parts) {
if(!is_na(dtype_parts) && dtype_parts$basic_type %in% c("b", "i", "u", "f", "S", "U")) {
return(TRUE)
Expand All @@ -40,12 +55,9 @@ check_dtype_support <- function(dtype_parts) {
return(FALSE)
}



#' @keywords internal
get_dtype_rtype <- function(dtype) {
dtype_parts <- get_dtype_parts(dtype)
check_dtype_support(dtype_parts)

# Reference: https://github.com/gzuidhof/zarr.js/blob/292804/src/nestedArray/types.ts#L32
BASICTYPE_RTYPE_MAPPING <- list(
Expand All @@ -63,7 +75,6 @@ get_dtype_rtype <- function(dtype) {
#' @keywords internal
get_dtype_endianness <- function(dtype) {
dtype_parts <- get_dtype_parts(dtype)
check_dtype_support(dtype_parts)

DTYPE_ENDIANNESS_MAPPING <- list(
"|" = "nr",
Expand All @@ -76,14 +87,12 @@ get_dtype_endianness <- function(dtype) {
#' @keywords internal
get_dtype_numbytes <- function(dtype) {
dtype_parts <- get_dtype_parts(dtype)
check_dtype_support(dtype_parts)
return(dtype_parts$num_bytes)
}

#' @keywords internal
get_dtype_signed <- function(dtype) {
dtype_parts <- get_dtype_parts(dtype)
check_dtype_support(dtype_parts)

DTYPE_SIGNED_MAPPING <- list(
"b" = FALSE,
Expand All @@ -96,18 +105,6 @@ get_dtype_signed <- function(dtype) {
return(DTYPE_SIGNED_MAPPING[[dtype_parts$basic_type]])
}

#' @keywords internal
get_dtype_from_array <- function(a) {
RTYPE_DTYPE_MAPPING <- list(
"logical" = "|b1",
"integer" = "<i4",
"double" = "<f8",
"character" = "|S8" # TODO: how many bytes to use here?
)
rtype_str <- typeof(a)
return(RTYPE_DTYPE_MAPPING[[rtype_str]])
}

#' @keywords internal
get_dtype_asrtype <- function(dtype) {
dtype_parts <- get_dtype_parts(dtype)
Expand All @@ -131,6 +128,8 @@ get_typed_array_ctr <- function(dtype) {
return(function(dim) array(data = rtype, dim = dim))
}

# Reference: https://numpy.org/doc/stable/reference/arrays.dtypes.html

#' The Zarr Dtype class.
#' @title Dtype Class
#' @docType class
Expand All @@ -140,28 +139,46 @@ get_typed_array_ctr <- function(dtype) {
#' @keywords internal
Dtype <- R6::R6Class("Dtype",
public = list(
#' @field dtype The original dtype string, like "<f4".
dtype = NULL,
#' @field byte_order The byte order of the dtype, either "little", "big", or "nr".
byte_order = NULL,
#' @field basic_type The basic type of the dtype, like "f".
basic_type = NULL,
#' @field num_bytes The number of bytes of the dtype.
num_bytes = NULL,
#' @field num_items The number of items of the dtype.
num_items = NULL,
#' @field is_signed Whether the dtype is signed. Logical/boolean.
is_signed = NULL,
#' @field is_structured Whether the dtype is structured. Logical/boolean.
is_structured = NULL,
#' @field is_object Whether the dtype is an object. Logical/boolean.
is_object = NULL,
#' @field object_codec The object codec instance.
object_codec = NULL,
#' @description
#' Create a new Dtype instance.
#' @param dtype The original dtype string, like "<f4".
#' @param object_codec The object codec instance.
#' @return A `Dtype` instance.
initialize = function(dtype, object_codec) {
initialize = function(dtype, object_codec = NA) {
self$dtype <- dtype

# TODO: support dtype_str == "|O" for object dtypes
# TODO: support dtype_str == "|O" for object dtypes / dont require numeric part of dtype string

dtype_parts <- get_dtype_parts(dtype)
check_dtype_support(dtype_parts)
self$byte_order <- dtype_parts$byte_order
self$byte_order <- get_dtype_endianness(dtype)
self$basic_type <- dtype_parts$basic_type
self$num_bytes <- dtype_parts$num_bytes
self$num_items <- dtype_parts$num_items

self$is_signed <- get_dtype_signed(dtype)
self$is_structured <- is_structured_dtype(dtype)
self$is_object <- (self$basic_type == "O")

# TODO: port code from normalize_dtype in zarr-python

self$object_codec <- object_codec
},
get_asrtype = function() {
Expand All @@ -170,26 +187,9 @@ Dtype <- R6::R6Class("Dtype",
get_rtype = function() {
return(get_dtype_rtype(self$dtype))
},
is_signed = function() {
return(get_dtype_signed(self$dtype))
},
is_structured = function() {
return(is_structured_dtype(self$dtype))
},
is_object = function() {
return(self$basic_type == "O")
},
get_byte_order = function() {
return(self$byte_order)
},
get_basic_type = function() {
return(self$basic_type)
},
get_num_bytes = function() {
return(self$num_bytes)
},
get_num_items = function() {
return(self$num_items)
get_typed_array_ctr = function() {
rtype <- self$get_rtype()
return(function(dim) array(data = rtype, dim = dim))
}
)
)
Loading

0 comments on commit 988a76d

Please sign in to comment.