diff --git a/rdata/_write.py b/rdata/_write.py index 39a8255..696546f 100644 --- a/rdata/_write.py +++ b/rdata/_write.py @@ -1,17 +1,23 @@ """Functions to perform conversion and unparsing in one step.""" + from __future__ import annotations from typing import TYPE_CHECKING -from .conversion import build_r_data, convert_to_r_object, convert_to_r_object_for_rda -from .conversion.to_r import DEFAULT_FORMAT_VERSION +from .conversion import ( + convert_python_to_r_data, +) +from .conversion.to_r import ( + DEFAULT_CLASS_MAP, + DEFAULT_FORMAT_VERSION, +) from .unparser import unparse_file if TYPE_CHECKING: import os from typing import Any - from .conversion.to_r import Encoding + from .conversion.to_r import ConstructorDict, Encoding from .unparser import Compression, FileFormat @@ -23,14 +29,12 @@ def write_rds( compression: Compression = "gzip", encoding: Encoding = "utf-8", format_version: int = DEFAULT_FORMAT_VERSION, + constructor_dict: ConstructorDict = DEFAULT_CLASS_MAP, ) -> None: """ Write an RDS file. - This is a convenience function that wraps - :func:`rdata.conversion.convert_to_r_object`, - :func:`rdata.conversion.build_r_data`, - and :func:`rdata.unparser.unparse_file`, + This is a convenience function that wraps conversion and unparsing as it is the common use case. Args: @@ -40,6 +44,8 @@ def write_rds( compression: Compression. encoding: Encoding to be used for strings within data. format_version: File format version. + constructor_dict: Dictionary mapping Python classes to + functions converting them to R classes. See Also: :func:`write_rda`: Similar function that writes an RDA or RDATA file. 
@@ -52,15 +58,13 @@ def write_rds( >>> data = ["hello", 1, 2.2, 3.3+4.4j] >>> rdata.write_rds("test.rds", data) """ - r_object = convert_to_r_object( + r_data = convert_python_to_r_data( data, encoding=encoding, - ) - r_data = build_r_data( - r_object, - encoding=encoding, format_version=format_version, + constructor_dict=constructor_dict, ) + unparse_file( path, r_data, @@ -78,14 +82,12 @@ def write_rda( compression: Compression = "gzip", encoding: Encoding = "utf-8", format_version: int = DEFAULT_FORMAT_VERSION, + constructor_dict: ConstructorDict = DEFAULT_CLASS_MAP, ) -> None: """ Write an RDA or RDATA file. - This is a convenience function that wraps - :func:`rdata.conversion.convert_to_r_object_for_rda`, - :func:`rdata.conversion.build_r_data`, - and :func:`rdata.unparser.unparse_file`, + This is a convenience function that wraps conversion and unparsing as it is the common use case. Args: @@ -95,6 +97,8 @@ def write_rda( compression: Compression. encoding: Encoding to be used for strings within data. format_version: File format version. + constructor_dict: Dictionary mapping Python classes to + functions converting them to R classes. See Also: :func:`write_rds`: Similar function that writes an RDS file. 
@@ -107,15 +111,14 @@ def write_rda( >>> data = {"name": "hello", "values": [1, 2.2, 3.3+4.4j]} >>> rdata.write_rda("test.rda", data) """ - r_object = convert_to_r_object_for_rda( + r_data = convert_python_to_r_data( data, encoding=encoding, - ) - r_data = build_r_data( - r_object, - encoding=encoding, format_version=format_version, + constructor_dict=constructor_dict, + file_type="rda", ) + unparse_file( path, r_data, diff --git a/rdata/conversion/__init__.py b/rdata/conversion/__init__.py index 44f6ad7..2d5a0ec 100644 --- a/rdata/conversion/__init__.py +++ b/rdata/conversion/__init__.py @@ -1,4 +1,5 @@ -"""Utilities for converting R objects to Python ones.""" +"""Utilities for converting between R and Python objects.""" + from ._conversion import ( DEFAULT_CLASS_MAP as DEFAULT_CLASS_MAP, Converter as Converter, @@ -25,7 +26,7 @@ ts_constructor as ts_constructor, ) from .to_r import ( - build_r_data as build_r_data, - convert_to_r_object as convert_to_r_object, - convert_to_r_object_for_rda as convert_to_r_object_for_rda, + ConverterFromPythonToR as ConverterFromPythonToR, + convert_python_to_r_data as convert_python_to_r_data, + convert_python_to_r_object as convert_python_to_r_object, ) diff --git a/rdata/conversion/_conversion.py b/rdata/conversion/_conversion.py index 7ad0957..dcd5305 100644 --- a/rdata/conversion/_conversion.py +++ b/rdata/conversion/_conversion.py @@ -394,20 +394,38 @@ def convert_array( return value # type: ignore [no-any-return] -R_INT_MIN = -2**31 - - def _dataframe_column_transform(source: Any) -> Any: # noqa: ANN401 if isinstance(source, np.ndarray): + dtype: Any if np.issubdtype(source.dtype, np.integer): - return pd.Series(source, dtype=pd.Int32Dtype()).array - - if np.issubdtype(source.dtype, np.bool_): - return pd.Series(source, dtype=pd.BooleanDtype()).array + dtype = pd.Int32Dtype() + elif np.issubdtype(source.dtype, np.floating): + # We return the numpy array here, which keeps + # R_FLOAT_NA, np.nan, and other NaNs as they were 
originally in the file. + # Users can then decide if they prefer to interpret + # only R_FLOAT_NA or all NaNs as "missing". + return source + # This would create an array with all NaNs as "missing": + # dtype = pd.Float64Dtype() # noqa: ERA001 + # This would create an array with only R_FLOAT_NA as "missing": + # from rdata.missing import is_na # noqa: ERA001 + # return pd.arrays.FloatingArray(source, is_na(source)) # noqa: ERA001 + elif np.issubdtype(source.dtype, np.complexfloating): + # There seems to be no pandas type for complex array + return source + elif np.issubdtype(source.dtype, np.bool_): + dtype = pd.BooleanDtype() + elif np.issubdtype(source.dtype, np.str_): + dtype = pd.StringDtype() + elif np.issubdtype(source.dtype, np.object_): + for value in source: + assert isinstance(value, str) or value is None + dtype = pd.StringDtype() + else: + return source - if np.issubdtype(source.dtype, np.str_): - return pd.Series(source, dtype=pd.StringDtype()).array + return pd.Series(source, dtype=dtype).array return source @@ -430,7 +448,7 @@ def dataframe_constructor( and isinstance(row_names, np.ma.MaskedArray) and row_names.mask[0] ) - else tuple(row_names) + else row_names ) return pd.DataFrame(obj, columns=obj, index=index) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 39df39e..9ec55d3 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -7,7 +7,9 @@ from typing import TYPE_CHECKING import numpy as np +import pandas as pd +from rdata.missing import R_FLOAT_NA, R_INT_NA from rdata.parser import ( CharFlags, RData, @@ -24,18 +26,31 @@ ) if TYPE_CHECKING: - from collections.abc import Mapping + from collections.abc import Callable, Mapping from typing import Any, Final, Literal, Protocol - Encoding = Literal["utf-8", "cp1252"] + import numpy.typing as npt + + from rdata.unparser import FileType + Encoding = Literal["utf-8", "cp1252"] class Converter(Protocol): - """Protocol for Py-to-R conversion.""" + """Protocol for 
class converting Python objects to R objects.""" + + format_version: int + + def convert_to_r_attributes(self, data: dict[str, Any]) -> RObject: + """Convert dictionary to R attributes list.""" - def __call__(self, data: Any, *, encoding: Encoding) -> RObject: # noqa: ANN401 - """Convert Python object to R object.""" + def convert_to_r_sym(self, name: str) -> RObject: + """Convert string to R symbol.""" + def convert_to_r_object(self, data: Any) -> RObject: # noqa: ANN401 + """Convert Python data to R object.""" + + ConstructorFunction = Callable[[Any, Converter], RObject] + ConstructorDict = Mapping[type, ConstructorFunction] # Default values for RVersions object @@ -47,15 +62,231 @@ def __call__(self, data: Any, *, encoding: Encoding) -> RObject: # noqa: ANN401 2: 0x20300, 3: 0x30500, }) +R_MINIMUM_VERSION_WITH_ENCODING: Final[int] = 3 +R_MINIMUM_VERSION_WITH_ALTREP: Final[int] = 3 + + +def categorical_constructor( + data: pd.Categorical, + converter: Converter, +) -> RObject: + """ + Construct R object components from pandas categorical. + + Args: + data: Pandas categorical. + converter: Python-to-R converter. + + Returns: + Components of the R object. + """ + r_attributes = converter.convert_to_r_attributes({ + "levels": data.categories.to_numpy(), + "class": "factor", + }) + + return build_r_object( + RObjectType.INT, + value=data.codes + 1, + is_object=True, + attributes=r_attributes, + ) + + +def dataframe_constructor( + data: pd.DataFrame, + converter: Converter, +) -> RObject: + """ + Construct R object components from pandas dataframe. + + Args: + data: Pandas dataframe. + converter: Python-to-R converter. + + Returns: + Components of the R object. 
+ """ + column_names = [] + r_value = [] + for column, series in data.items(): + assert isinstance(column, str) + column_names.append(column) + + pd_array = series.array + array: pd.Categorical | npt.NDArray[Any] + if isinstance(pd_array, pd.Categorical): + array = pd_array + else: + array = convert_pd_array_to_np_array(pd_array) + r_series = converter.convert_to_r_object(array) + r_value.append(r_series) + + index = data.index + if isinstance(index, pd.RangeIndex): + assert isinstance(index.start, int) + if index.start == 1 and index.stop == data.shape[0] + 1 and index.step == 1: + # Construct default row names stored as [R_INT_NA, -len] + row_names = np.ma.array( + data=[R_INT_NA, -data.shape[0]], + mask=[True, False], + fill_value=R_INT_NA, + ) + else: + row_names = index + elif isinstance(index, pd.Index): + if index.dtype == "object" or np.issubdtype(str(index.dtype), np.integer): + row_names = index.to_numpy() + else: + msg = f"pd.DataFrame pd.Index {index.dtype} not implemented" + raise NotImplementedError(msg) + else: + msg = f"pd.DataFrame index {type(index)} not implemented" + raise NotImplementedError(msg) + + r_attributes = converter.convert_to_r_attributes({ + "names": np.array(column_names, dtype=np.dtype("U")), + "class": "data.frame", + "row.names": row_names, + }) + + return build_r_object( + RObjectType.VEC, + value=r_value, + is_object=True, + attributes=r_attributes, + ) + + +def rangeindex_constructor( + data: pd.RangeIndex, + converter: Converter, +) -> RObject: + """ + Construct R object components from pandas rangeindex. + + Args: + data: Pandas rangeindex. + converter: Python-to-R converter. + + Returns: + Components of the R object. 
+ """ + if converter.format_version < R_MINIMUM_VERSION_WITH_ALTREP: + # ALTREP support is from R version 3.5.0 + # (minimum version for format version 3) + return build_r_object( + RObjectType.INT, + value=np.array(data), + ) + + assert isinstance(data.step, int) + if data.step != 1: + # R supports compact sequences only with step 1; + # convert the range to an array of values + return build_r_object( + RObjectType.INT, + value=np.array(data), + ) + + r_value = ( + build_r_list([ + converter.convert_to_r_sym("compact_intseq"), + converter.convert_to_r_sym("base"), + converter.convert_to_r_object(RObjectType.INT.value), + ]), + converter.convert_to_r_object(np.array([ + len(data), + data.start, + data.step, + ], dtype=float)), + converter.convert_to_r_object(None), + ) + + return build_r_object( + RObjectType.ALTREP, + value=r_value, + ) + + +DEFAULT_CLASS_MAP: Final[ConstructorDict] = MappingProxyType({ + pd.Categorical: categorical_constructor, + pd.DataFrame: dataframe_constructor, + pd.RangeIndex: rangeindex_constructor, +}) + + +def convert_pd_array_to_np_array( + pd_array: pd.api.extensions.ExtensionArray, +) -> npt.NDArray[Any]: + """ + Convert pandas array object to numpy array. + + Args: + pd_array: Pandas array. + + Returns: + Numpy array. 
+ """ + if isinstance(pd_array, pd.arrays.StringArray): + return pd_array.to_numpy() + + if isinstance(pd_array, ( + pd.arrays.BooleanArray, + pd.arrays.IntegerArray, + )): + dtype: type[Any] + fill_value: bool | int + if isinstance(pd_array, pd.arrays.BooleanArray): + dtype = np.bool_ + fill_value = True + elif isinstance(pd_array, pd.arrays.IntegerArray): + dtype = np.int32 + fill_value = R_INT_NA + + mask = pd_array.isna() # type: ignore [no-untyped-call] + if np.any(mask): + data = pd_array.to_numpy(dtype=dtype, na_value=fill_value) + array = np.ma.array( # type: ignore [no-untyped-call] + data=data, + mask=mask, + fill_value=fill_value, + ) + else: + array = pd_array.to_numpy() + assert array.dtype == dtype + assert isinstance(array, np.ndarray) # for mypy + return array + + if isinstance(pd_array, ( + pd.arrays.FloatingArray, # type: ignore [attr-defined] + )): + # Note that this possibly maps all NaNs (not only R_FLOAT_NA) + # to the same `na_value` depending on how the array was built: + array = pd_array.to_numpy(dtype=np.float64, na_value=R_FLOAT_NA) + assert isinstance(array, np.ndarray) # for mypy + return array + + if isinstance(pd_array, ( + pd.arrays.NumpyExtensionArray, # type: ignore [attr-defined] + )): + array = pd_array.to_numpy() + assert isinstance(array, np.ndarray) # for mypy + return array + + msg = f"pandas array {type(pd_array)} not implemented" + raise NotImplementedError(msg) def build_r_object( - r_type: RObjectType, - *, - value: Any = None, # noqa: ANN401 - attributes: RObject | None = None, - tag: RObject | None = None, - gp: int = 0, + r_type: RObjectType, + *, + value: Any = None, # noqa: ANN401 + is_object: bool = False, + attributes: RObject | None = None, + tag: RObject | None = None, + gp: int = 0, + reference: tuple[int, RObject | None] = (0, None), ) -> RObject: """ Build R object. @@ -63,9 +294,11 @@ def build_r_object( Args: r_type: Type indentifier. value: Value for RObject. + is_object: True if RObject represents object. 
attributes: Same as in RObject. tag: Same as in RObject. gp: Same as in RObjectInfo. + reference: Tuple of integer and object. Returns: R object. @@ -75,153 +308,386 @@ def build_r_object( RObjectInfo """ assert r_type is not None + reference_id, referenced_object = reference + assert ( + (reference_id == 0) + == (referenced_object is None) + == (r_type != RObjectType.REF) + ) return RObject( RObjectInfo( r_type, - object=False, + object=is_object, attributes=attributes is not None, tag=tag is not None, gp=gp, - reference=0, - ), - value, - attributes, - tag, - None, - ) + reference=reference_id, + ), + value, + attributes, + tag, + referenced_object, + ) def build_r_list( - data: Mapping[str, Any] | list[Any], - *, - encoding: Encoding, - convert_value: Converter | None = None, + data: list[RObject] | list[tuple[RObject, RObject]], ) -> RObject: """ - Build R object representing named linked list. + Build R object representing (named) linked list. Args: - data: Non-empty dictionary or list. - encoding: Encoding to be used for strings within data. - convert_value: Function used for converting value to R object - (for example, convert_to_r_object). + data: Non-empty list of values or (key, value) pairs. Returns: R object. 
""" - if convert_value is None: - convert_value = convert_to_r_object - if len(data) == 0: msg = "data must not be empty" raise ValueError(msg) - if isinstance(data, dict): - data = data.copy() - key = next(iter(data)) - value1 = convert_value(data.pop(key), encoding=encoding) - tag = build_r_sym(key, encoding=encoding) - elif isinstance(data, list): - value1 = convert_value(data[0], encoding=encoding) - data = data[1:] + head = data[0] + tail = data[1:] + if isinstance(head, tuple): + tag, car = head + else: tag = None + car = head - if len(data) == 0: - value2 = build_r_object(RObjectType.NILVALUE) - else: - value2 = build_r_list(data, encoding=encoding, convert_value=convert_value) + cdr = build_r_object(RObjectType.NILVALUE) if len(tail) == 0 else build_r_list(tail) - return build_r_object( - RObjectType.LIST, - value=(value1, value2), - tag=tag, - ) + return build_r_object(RObjectType.LIST, value=(car, cdr), tag=tag) -def build_r_sym( - data: str, - *, - encoding: Encoding, +def build_r_char( + data: str | bytes | None, + *, + encoding: Encoding, ) -> RObject: """ - Build R object representing symbol. + Build R object representing characters. Args: - data: String. - encoding: Encoding to be used for strings within data. + data: String or bytestring. + encoding: Encoding used for strings. Returns: R object. """ - r_type = RObjectType.SYM - r_value = convert_to_r_object(data.encode(encoding), encoding=encoding) - return build_r_object(r_type, value=r_value) + if data is None: + return build_r_object(RObjectType.CHAR) + + if isinstance(data, str): + data = data.encode(encoding) + + if all(chr(byte) in string.printable for byte in data): + gp = CharFlags.ASCII + elif encoding == "utf-8": + gp = CharFlags.UTF8 + elif encoding == "cp1252": + # Note! + # CP1252 and Latin1 are not the same. + # Does CharFlags.LATIN1 mean actually CP1252 + # as R on Windows mentions CP1252 as encoding? + # Or does CP1252 change to e.g. CP1250 depending on localization? 
+ gp = CharFlags.LATIN1 + else: + msg = f"unsupported encoding: {encoding}" + raise ValueError(msg) + return build_r_object(RObjectType.CHAR, value=data, gp=gp) -def build_r_data( - r_object: RObject, - *, - encoding: Encoding = "utf-8", - format_version: int = DEFAULT_FORMAT_VERSION, - r_version_serialized: int = DEFAULT_R_VERSION_SERIALIZED, -) -> RData: +class ConverterFromPythonToR: """ - Build RData object from R object. + Class converting Python objects to R objects. - Args: - r_object: R object. + Attributes: encoding: Encoding to be used for strings within data. format_version: File format version. r_version_serialized: R version written as the creator of the object. + constructor_dict: Dictionary mapping Python classes to + functions converting them to R classes. + """ - Returns: - Corresponding RData object. + def __init__( + self, + *, + encoding: Encoding = "utf-8", + format_version: int = DEFAULT_FORMAT_VERSION, + r_version_serialized: int = DEFAULT_R_VERSION_SERIALIZED, + constructor_dict: ConstructorDict = DEFAULT_CLASS_MAP, + ) -> None: + """ + Init class. + + Args: + encoding: Encoding to be used for strings within data. + format_version: File format version. + r_version_serialized: R version written as the creator of the object. + constructor_dict: Dictionary mapping Python classes to + functions converting them to R classes. + """ + self.encoding = encoding + self.format_version = format_version + self.r_version_serialized = r_version_serialized + self.constructor_dict = constructor_dict + self._references: dict[str | None, tuple[int, RObject | None]] = { + None: (0, None), + } + + def convert_to_r_data( + self, + data: Any, # noqa: ANN401 + *, + file_type: FileType = "rds", + ) -> RData: + """ + Convert Python data to R data. + + Args: + data: Any Python object. + file_type: File type. + + Returns: + Corresponding RData object. 
+ + See Also: + convert_to_r_object + """ + if file_type == "rda": + if not isinstance(data, dict): + msg = f"for RDA file, data must be a dictionary, not type {type(data)}" + raise TypeError(msg) + if not all(isinstance(key, str) for key in data): + msg = "for RDA file, dictionary keys must be strings" + raise ValueError(msg) + r_object = self.convert_to_r_attributes(data) + else: + r_object = self.convert_to_r_object(data) - See Also: - convert_to_r_object - """ - versions = RVersions( - format_version, - r_version_serialized, - R_MINIMUM_VERSIONS[format_version], - ) + versions = RVersions( + self.format_version, + self.r_version_serialized, + R_MINIMUM_VERSIONS[self.format_version], + ) - minimum_version_with_encoding = 3 - extra = (RExtraInfo(encoding.upper()) - if versions.format >= minimum_version_with_encoding - else RExtraInfo(None)) + extra = ( + RExtraInfo(self.encoding.upper()) + if versions.format >= R_MINIMUM_VERSION_WITH_ENCODING + else RExtraInfo(None) + ) - return RData(versions, extra, r_object) + return RData(versions, extra, r_object) + + def convert_to_r_attributes( + self, + data: dict[str, Any], + ) -> RObject: + """ + Convert dictionary to R attributes list. + + Args: + data: Non-empty dictionary. + + Returns: + R object. + """ + converted = [] + for key, value in data.items(): + converted.append(( + self.convert_to_r_sym(key), + self.convert_to_r_object(value), + )) + + return build_r_list(converted) + + def convert_to_r_sym( + self, + name: str, + ) -> RObject: + """ + Convert string to R symbol. + + Args: + name: String. + + Returns: + R object. 
+ """ + # Reference to existing symbol if exists + if name in self._references: + reference = self._references[name] + return build_r_object(RObjectType.REF, reference=reference) + + # Create a new symbol + r_value = self.convert_to_r_object(name.encode(self.encoding)) + r_object = build_r_object(RObjectType.SYM, value=r_value) + + # Add to reference list + self._references[name] = (len(self._references), r_object) + return r_object + + def convert_to_r_object( # noqa: C901, PLR0912, PLR0915 + self, + data: Any, # noqa: ANN401 + ) -> RObject: + """ + Convert Python data to R object. + + Args: + data: Any Python object. + + Returns: + Corresponding R object. + """ + # Default args for most types (None/False/0) + r_type = None + values: list[Any] | tuple[Any, ...] + r_value: Any = None + is_object = False + attributes: dict[str, Any] | None = None + + if data is None: + r_type = RObjectType.NILVALUE + + elif isinstance(data, RExpression): + r_type = RObjectType.EXPR + r_value = [self.convert_to_r_object(el) for el in data.elements] + + elif isinstance(data, RLanguage): + r_type = RObjectType.LANG + symbols = [self.convert_to_r_sym(el) for el in data.elements] + r_value = (symbols[0], build_r_list(symbols[1:])) + + if len(data.attributes) > 0: + # The following might work here (untested) + # attributes = data.attributes # noqa: ERA001 + msg = f"type {r_type} with attributes not implemented" + raise NotImplementedError(msg) + elif isinstance(data, (list, tuple, dict)): + r_type = RObjectType.VEC + values = list(data.values()) if isinstance(data, dict) else data + r_value = [self.convert_to_r_object(el) for el in values] -def convert_to_r_object_for_rda( - data: Mapping[str, Any], - *, - encoding: Encoding = "utf-8", -) -> RObject: + if isinstance(data, dict): + if not all(isinstance(key, str) for key in data): + msg = "dictionary keys must be strings" + raise ValueError(msg) + names = np.array(list(data.keys()), dtype=np.dtype("U")) + attributes = {"names": names} + + 
elif isinstance(data, np.ndarray): + # Promote 0-dimensional array to 1-dimensional array + if data.ndim == 0: + data = data[np.newaxis] + + if data.dtype.kind in ["O"]: + assert data.ndim == 1 + r_type = RObjectType.STR + r_value = [] + for el in data: + if el is None or pd.isna(el): + r_el = build_r_char(None, encoding=self.encoding) + elif isinstance(el, str): + r_el = build_r_char(el, encoding=self.encoding) + else: + msg = "general object array not implemented" + raise NotImplementedError(msg) + r_value.append(r_el) + + elif data.dtype.kind in ["S"]: # bytes object is converted to this dtype + assert data.size == 1 + return build_r_char(data[0], encoding=self.encoding) + + elif data.dtype.kind in ["U"]: + assert data.ndim == 1 + r_type = RObjectType.STR + r_value = [build_r_char(el, encoding=self.encoding) for el in data] + + else: + r_type = { + "b": RObjectType.LGL, + "i": RObjectType.INT, + "f": RObjectType.REAL, + "c": RObjectType.CPLX, + }[data.dtype.kind] + + if data.ndim == 1: + r_value = data + else: + # R uses column-major order like Fortran + r_value = np.ravel(data, order="F") + attributes = {"dim": np.array(data.shape)} + + elif isinstance(data, (bool, int, float, complex, str, bytes)): + return self.convert_to_r_object(np.array(data)) + + else: + # Check available constructors + for t, constructor in self.constructor_dict.items(): + if isinstance(data, t): + return constructor(data, self) + + msg = f"type {type(data)} not implemented" + raise NotImplementedError(msg) + + if attributes is not None: + is_object = "class" in attributes + r_attributes = self.convert_to_r_attributes(attributes) + else: + r_attributes = None + + return build_r_object( + r_type, + value=r_value, + is_object=is_object, + attributes=r_attributes, + ) + + +def convert_python_to_r_data( + data: Any, # noqa: ANN401 + *, + encoding: Encoding = "utf-8", + format_version: int = DEFAULT_FORMAT_VERSION, + r_version_serialized: int = DEFAULT_R_VERSION_SERIALIZED, + 
constructor_dict: ConstructorDict = DEFAULT_CLASS_MAP, + file_type: FileType = "rds", +) -> RData: """ - Convert Python dictionary to R object for RDA file. + Convert Python data to R data. Args: - data: Python dictionary with data and variable names. + data: Any Python object. encoding: Encoding to be used for strings within data. + format_version: File format version. + r_version_serialized: R version written as the creator of the object. + constructor_dict: Dictionary mapping Python classes to + functions converting them to R classes. + file_type: File type. Returns: - Corresponding R object. + Corresponding RData object. See Also: - convert_to_r_object + convert_python_to_r_object """ - if not isinstance(data, dict): - msg = f"for RDA file, data must be a dictionary, not type {type(data)}" - raise TypeError(msg) - return build_r_list(data, encoding=encoding) - - -def convert_to_r_object( # noqa: C901, PLR0912, PLR0915 - data: Any, # noqa: ANN401 - *, - encoding: Encoding = "utf-8", + return ConverterFromPythonToR( + encoding=encoding, + format_version=format_version, + r_version_serialized=r_version_serialized, + constructor_dict=constructor_dict, + ).convert_to_r_data(data, file_type=file_type) + + +def convert_python_to_r_object( + data: Any, # noqa: ANN401 + *, + encoding: Encoding = "utf-8", + format_version: int = DEFAULT_FORMAT_VERSION, + r_version_serialized: int = DEFAULT_R_VERSION_SERIALIZED, + constructor_dict: ConstructorDict = DEFAULT_CLASS_MAP, ) -> RObject: """ Convert Python data to R object. @@ -229,115 +695,20 @@ def convert_to_r_object( # noqa: C901, PLR0912, PLR0915 Args: data: Any Python object. encoding: Encoding to be used for strings within data. + format_version: File format version. + r_version_serialized: R version written as the creator of the object. + constructor_dict: Dictionary mapping Python classes to + functions converting them to R classes. Returns: - Corresponding R object. + Corresponding RObject object. 
See Also: - convert_to_r_data + convert_python_to_r_data """ - # Default args for most types (None/False/0) - r_type = None - values: list[Any] | tuple[Any, ...] - r_value: Any = None - gp = 0 - attributes = None - tag = None - - if data is None: - r_type = RObjectType.NILVALUE - - elif isinstance(data, RExpression): - r_type = RObjectType.EXPR - r_value = [convert_to_r_object(el, encoding=encoding) for el in data.elements] - - elif isinstance(data, RLanguage): - r_type = RObjectType.LANG - values = data.elements - r_value = (build_r_sym(str(values[0]), encoding=encoding), - build_r_list(values[1:], encoding=encoding, - convert_value=build_r_sym)) - - if len(data.attributes) > 0: - # The following might work here (untested) - # attributes = build_r_list(data.attributes, encoding=encoding) # noqa: ERA001,E501 - msg = f"type {r_type} with attributes not implemented" - raise NotImplementedError(msg) - - elif isinstance(data, (list, tuple, dict)): - r_type = RObjectType.VEC - values = list(data.values()) if isinstance(data, dict) else data - r_value = [convert_to_r_object(el, encoding=encoding) for el in values] - - if isinstance(data, dict): - names = np.array(list(data.keys()), dtype=np.dtype("U")) - attributes = build_r_list({"names": names}, - encoding=encoding) - - elif isinstance(data, np.ndarray): - if data.dtype.kind in ["O"]: - # This is a special case handling only np.array([None]) - if data.size != 1 or data[0] is not None: - msg = "general object array not implemented" - raise NotImplementedError(msg) - r_type = RObjectType.STR - r_value = [build_r_object(RObjectType.CHAR)] - - elif data.dtype.kind in ["S"]: - assert data.ndim == 1 - r_type = RObjectType.STR - r_value = [convert_to_r_object(el, encoding=encoding) for el in data] - - elif data.dtype.kind in ["U"]: - assert data.ndim == 1 - data = np.array([s.encode(encoding) for s in data], dtype=np.dtype("S")) - return convert_to_r_object(data, encoding=encoding) - - else: - r_type = { - "b": 
RObjectType.LGL, - "i": RObjectType.INT, - "f": RObjectType.REAL, - "c": RObjectType.CPLX, - }[data.dtype.kind] - - if data.ndim == 0: - r_value = data[np.newaxis] - elif data.ndim == 1: - r_value = data - else: - # R uses column-major order like Fortran - r_value = np.ravel(data, order="F") - attributes = build_r_list({"dim": np.array(data.shape)}, - encoding=encoding) - - elif isinstance(data, (bool, int, float, complex)): - return convert_to_r_object(np.array(data), encoding=encoding) - - elif isinstance(data, str): - r_type = RObjectType.STR - r_value = [convert_to_r_object(data.encode(encoding), encoding=encoding)] - - elif isinstance(data, bytes): - r_type = RObjectType.CHAR - if all(chr(byte) in string.printable for byte in data): - gp = CharFlags.ASCII - elif encoding == "utf-8": - gp = CharFlags.UTF8 - elif encoding == "cp1252": - # Note! - # CP1252 and Latin1 are not the same. - # Does CharFlags.LATIN1 mean actually CP1252 - # as R on Windows mentions CP1252 as encoding? - # Or does CP1252 change to e.g. CP1250 depending on localization? - gp = CharFlags.LATIN1 - else: - msg = f"unsupported encoding: {encoding}" - raise ValueError(msg) - r_value = data - - else: - msg = f"type {type(data)} not implemented" - raise NotImplementedError(msg) - - return build_r_object(r_type, value=r_value, attributes=attributes, tag=tag, gp=gp) + return ConverterFromPythonToR( + encoding=encoding, + format_version=format_version, + r_version_serialized=r_version_serialized, + constructor_dict=constructor_dict, + ).convert_to_r_object(data) diff --git a/rdata/missing.py b/rdata/missing.py new file mode 100644 index 0000000..2cad764 --- /dev/null +++ b/rdata/missing.py @@ -0,0 +1,109 @@ +"""Utilities for missing (NA) values in R.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np + +if TYPE_CHECKING: + from typing import Any, Final + + import numpy.typing as npt + + +#: Value used to represent a missing integer in R. 
+R_INT_NA: Final[int] = np.int32(-2**31) # type: ignore [assignment] + +#: Value used to represent a missing float in R. +# This is a NaN with a particular payload, but it's not the same as np.nan. +R_FLOAT_NA: Final[float] = np.uint64(0x7ff00000000007a2).view(np.float64) # type: ignore [assignment] + + +def get_na_value(dtype: np.dtype[Any]) -> Any: # noqa: ANN401 + """ + Get NA value for a given type. + + Args: + dtype: NumPy dtype. + + Returns: + NA value of given dtype. + """ + if dtype == np.int32: + return R_INT_NA + if dtype == np.float64: + return R_FLOAT_NA + msg = f"NA for numpy dtype {dtype} not implemented" + raise NotImplementedError(msg) + + +def is_na( + array: Any | npt.NDArray[Any], # noqa: ANN401 +) -> bool | npt.NDArray[np.bool_]: + """ + Check if the array elements are NA. + + Args: + array: NumPy array or single value. + + Returns: + Boolean mask of NA values in the array. + """ + if isinstance(array, np.ndarray): + dtype = array.dtype + na = get_na_value(dtype) + if dtype == np.int32: + # Use the native dtype for comparison when possible; + # slightly faster than the steps below + return array == na # type: ignore [no-any-return] + # Convert dtype to unsigned integer to perform byte-by-byte + # equality comparison to distinguish different NaN values + raw_dtype = f"u{array.dtype.itemsize}" + return array.view(raw_dtype) == np.array(na).view(raw_dtype) # type: ignore [no-any-return] + + if isinstance(array, int): + try: + # Python built-in integer is 64 bits or larger, so + # we try to cast it to 32-bit int if possible + return is_na(np.array(array, dtype=np.int32)) + except OverflowError: + # Proceed with larger integer (in case it is supported at some point) + return is_na(np.array(array)) + + if isinstance(array, (float, np.int32, np.float64)): + return is_na(np.array(array)) + + msg = f"NA for {type(array)} not implemented" + raise NotImplementedError(msg) + + +def mask_na_values( + array: npt.NDArray[Any], + *, + fill_value: Any | None = 
None, # noqa: ANN401 +) -> npt.NDArray[Any] | np.ma.MaskedArray[Any, Any]: + """ + Mask NA elements of the array. + + Args: + array: NumPy array. + fill_value: Fill value for the masked array. + Defaults to the NA value. + + Returns: + NumPy masked array with NA values as the mask + or the original array if there is no NA elements. + """ + mask = is_na(array) + if np.any(mask): + if fill_value is None: + fill_value = get_na_value(array.dtype) + + array[mask] = fill_value + return np.ma.array( # type: ignore [no-untyped-call,no-any-return] + data=array, + mask=mask, + fill_value=fill_value, + ) + return array diff --git a/rdata/parser/__init__.py b/rdata/parser/__init__.py index d62b6e9..48421e6 100644 --- a/rdata/parser/__init__.py +++ b/rdata/parser/__init__.py @@ -2,7 +2,6 @@ from ._parser import ( DEFAULT_ALTREP_MAP as DEFAULT_ALTREP_MAP, - R_INT_NA as R_INT_NA, CharFlags as CharFlags, RData as RData, RExtraInfo as RExtraInfo, diff --git a/rdata/parser/_ascii.py b/rdata/parser/_ascii.py index 22afa7e..a79d2ba 100644 --- a/rdata/parser/_ascii.py +++ b/rdata/parser/_ascii.py @@ -6,7 +6,17 @@ import numpy as np import numpy.typing as npt -from ._parser import R_INT_NA, AltRepConstructorMap, Parser +from rdata.missing import R_FLOAT_NA, R_INT_NA + +from ._parser import AltRepConstructorMap, Parser + + +def map_int_na(line: str) -> int: + return R_INT_NA if line == "NA" else int(line) + + +def map_float_na(line: str) -> float: + return R_FLOAT_NA if line == "NA" else float(line) class ParserASCII(Parser): @@ -30,11 +40,10 @@ def _readline(self) -> str: return self.file.readline()[:-1] def _parse_array_values( - self, - dtype: npt.DTypeLike, - length: int, + self, + dtype: npt.DTypeLike, + length: int, ) -> npt.NDArray[Any]: - array = np.empty(length, dtype=dtype) value: int | float | complex @@ -42,14 +51,16 @@ def _parse_array_values( line = self._readline() if np.issubdtype(dtype, np.integer): - value = R_INT_NA if line == "NA" else int(line) + value = 
map_int_na(line) elif np.issubdtype(dtype, np.floating): - value = float(line) + value = map_float_na(line) elif np.issubdtype(dtype, np.complexfloating): + value1 = map_float_na(line) line2 = self._readline() - value = complex(float(line), float(line2)) + value2 = map_float_na(line2) + value = complex(value1, value2) else: msg = f"Unknown dtype: {dtype}" diff --git a/rdata/parser/_parser.py b/rdata/parser/_parser.py index 4d4a3ec..8c90486 100644 --- a/rdata/parser/_parser.py +++ b/rdata/parser/_parser.py @@ -23,15 +23,13 @@ import numpy as np import numpy.typing as npt +from rdata.missing import R_INT_NA, mask_na_values + if TYPE_CHECKING: from ._ascii import ParserASCII from ._xdr import ParserXDR -#: Value used to represent a missing integer in R. -R_INT_NA: Final = -2**31 - - @runtime_checkable class BinaryFileLike(Protocol): """Protocol for binary files.""" @@ -371,6 +369,9 @@ def __eq__(self, other: object) -> bool: return False # Compare value field + if not isinstance(other.value, type(self.value)): + return False + if isinstance(self.value, np.ndarray): if not np.array_equal(self.value, other.value, equal_nan=True): return False @@ -540,6 +541,22 @@ def wrap_constructor( return new_info, value +def get_altrep_name(info: RObject) -> bytes: + """Get the name of the ALTREP object.""" + assert info.info.type == RObjectType.LIST + + class_sym = info.value[0] + while class_sym.info.type == RObjectType.REF: + class_sym = class_sym.referenced_object + + assert class_sym.info.type == RObjectType.SYM + assert class_sym.value.info.type == RObjectType.CHAR + + altrep_name = class_sym.value.value + assert isinstance(altrep_name, bytes) + return altrep_name + + default_altrep_map_dict: Final[Mapping[bytes, AltRepConstructor]] = { b"deferred_string": deferred_string_constructor, b"compact_intseq": compact_intseq_constructor, @@ -608,17 +625,7 @@ def parse_nullable_int_array( ) -> npt.NDArray[np.int32] | np.ma.MaskedArray[Any, Any]: """Parse an integer array.""" data = 
self._parse_array(np.int32) - mask = (data == R_INT_NA) - data[mask] = fill_value - - if np.any(mask): - return np.ma.array( # type: ignore [no-untyped-call,no-any-return] - data=data, - mask=mask, - fill_value=fill_value, - ) - - return data + return mask_na_values(data, fill_value=fill_value) def parse_double_array(self) -> npt.NDArray[np.float64]: """Parse a double array.""" @@ -678,18 +685,7 @@ def expand_altrep_to_object( state: RObject, ) -> tuple[RObjectInfo, Any]: """Expand alternative representation to normal object.""" - assert info.info.type == RObjectType.LIST - - class_sym = info.value[0] - while class_sym.info.type == RObjectType.REF: - class_sym = class_sym.referenced_object - - assert class_sym.info.type == RObjectType.SYM - assert class_sym.value.info.type == RObjectType.CHAR - - altrep_name = class_sym.value.value - assert isinstance(altrep_name, bytes) - + altrep_name = get_altrep_name(info) constructor = self.altrep_constructor_dict[altrep_name] return constructor(state) diff --git a/rdata/parser/_xdr.py b/rdata/parser/_xdr.py index 6d265dd..fe5211a 100644 --- a/rdata/parser/_xdr.py +++ b/rdata/parser/_xdr.py @@ -26,9 +26,9 @@ def __init__( self.file = io.BytesIO(data) def _parse_array_values( - self, - dtype: npt.DTypeLike, - length: int, + self, + dtype: npt.DTypeLike, + length: int, ) -> npt.NDArray[Any]: dtype = np.dtype(dtype) buffer = self.file.read(length * dtype.itemsize) diff --git a/rdata/tests/data/test_dataframe.rda b/rdata/tests/data/test_dataframe.rda index bd83517..61cbf30 100644 Binary files a/rdata/tests/data/test_dataframe.rda and b/rdata/tests/data/test_dataframe.rda differ diff --git a/rdata/tests/data/test_dataframe.rds b/rdata/tests/data/test_dataframe.rds index b5f2382..bdfbdba 100644 Binary files a/rdata/tests/data/test_dataframe.rds and b/rdata/tests/data/test_dataframe.rds differ diff --git a/rdata/tests/data/test_dataframe_dtypes.rds b/rdata/tests/data/test_dataframe_dtypes.rds new file mode 100644 index 
0000000..aeb9ffb Binary files /dev/null and b/rdata/tests/data/test_dataframe_dtypes.rds differ diff --git a/rdata/tests/data/test_dataframe_dtypes_with_na.rds b/rdata/tests/data/test_dataframe_dtypes_with_na.rds new file mode 100644 index 0000000..17a170c Binary files /dev/null and b/rdata/tests/data/test_dataframe_dtypes_with_na.rds differ diff --git a/rdata/tests/data/test_dataframe_float_with_na_nan.rds b/rdata/tests/data/test_dataframe_float_with_na_nan.rds new file mode 100644 index 0000000..fed00f4 Binary files /dev/null and b/rdata/tests/data/test_dataframe_float_with_na_nan.rds differ diff --git a/rdata/tests/data/test_dataframe_int_rownames.rds b/rdata/tests/data/test_dataframe_int_rownames.rds new file mode 100644 index 0000000..74772a2 Binary files /dev/null and b/rdata/tests/data/test_dataframe_int_rownames.rds differ diff --git a/rdata/tests/data/test_dataframe_range_rownames.rds b/rdata/tests/data/test_dataframe_range_rownames.rds new file mode 100644 index 0000000..2f7ae99 Binary files /dev/null and b/rdata/tests/data/test_dataframe_range_rownames.rds differ diff --git a/rdata/tests/data/test_dataframe_v3.rda b/rdata/tests/data/test_dataframe_v3.rda index 01e2824..b5955f4 100644 Binary files a/rdata/tests/data/test_dataframe_v3.rda and b/rdata/tests/data/test_dataframe_v3.rda differ diff --git a/rdata/tests/data/test_dataframe_v3.rds b/rdata/tests/data/test_dataframe_v3.rds index 6c2ada7..8e2492d 100644 Binary files a/rdata/tests/data/test_dataframe_v3.rds and b/rdata/tests/data/test_dataframe_v3.rds differ diff --git a/rdata/tests/test_missing.py b/rdata/tests/test_missing.py new file mode 100644 index 0000000..ac3e22b --- /dev/null +++ b/rdata/tests/test_missing.py @@ -0,0 +1,90 @@ +"""Tests of missing value functionality.""" + +from __future__ import annotations + +from typing import Any + +import numpy as np +import pytest + +from rdata.missing import R_FLOAT_NA, R_INT_NA, is_na, mask_na_values + + +def test_int_is_na() -> None: + """Test 
checking NA values in int array.""" + array = np.array([1, 2, R_INT_NA], dtype=np.int32) + ref_mask = np.array([0, 0, 1], dtype=np.bool_) + + mask = is_na(array) + np.testing.assert_array_equal(mask, ref_mask) + + +def test_float_is_na() -> None: + """Test checking NA values in float array.""" + array = np.array([1, 2, R_FLOAT_NA, np.nan], dtype=np.float64) + ref_mask = np.array([0, 0, 1, 0], dtype=np.bool_) + + mask = is_na(array) + np.testing.assert_array_equal(mask, ref_mask) + + +@pytest.mark.parametrize("value", [R_INT_NA, R_FLOAT_NA]) +def test_value_is_na(value: Any) -> None: # noqa: ANN401 + """Test checking single NA values.""" + assert is_na(value) + + +@pytest.mark.parametrize("value", [ + np.int32(0), 0, np.float64(0.0), 0.0, np.nan, +]) +def test_value_is_not_na(value: Any) -> None: # noqa: ANN401 + """Test checking single NA values.""" + assert not is_na(value) + + +def test_int64() -> None: + """Test checking int64.""" + with pytest.raises(NotImplementedError): + is_na(2**32) + with pytest.raises(NotImplementedError): + is_na(-2**32) + + +def test_wrong_type() -> None: + """Test checking int64.""" + with pytest.raises(NotImplementedError): + is_na("test") + + +def test_masked_array() -> None: + """Test checking masked array creation.""" + array = np.array([1, 2, R_FLOAT_NA, np.nan], dtype=np.float64) + ref_mask = np.array([0, 0, 1, 0], dtype=np.bool_) + ref_data = array.copy() + + masked = mask_na_values(array) + assert isinstance(masked, np.ma.MaskedArray) + np.testing.assert_array_equal(masked.data, ref_data) + np.testing.assert_array_equal(masked.mask, ref_mask) + + +def test_masked_array_fill() -> None: + """Test checking masked array creation.""" + array = np.array([1, 2, R_FLOAT_NA, np.nan], dtype=np.float64) + ref_mask = np.array([0, 0, 1, 0], dtype=np.bool_) + ref_data = array.copy() + ref_data[ref_mask] = 42 + + masked = mask_na_values(array, fill_value=42) + assert isinstance(masked, np.ma.MaskedArray) + 
np.testing.assert_array_equal(masked.data, ref_data) + np.testing.assert_array_equal(masked.mask, ref_mask) + + +def test_nonmasked_array() -> None: + """Test checking masked array no-op.""" + array = np.array([1, 2, np.nan, np.nan], dtype=np.float64) + + masked = mask_na_values(array) + assert not isinstance(masked, np.ma.MaskedArray) + np.testing.assert_array_equal(masked, array) diff --git a/rdata/tests/test_rdata.py b/rdata/tests/test_rdata.py index c1a5efa..d06df71 100644 --- a/rdata/tests/test_rdata.py +++ b/rdata/tests/test_rdata.py @@ -13,6 +13,7 @@ import xarray import rdata +from rdata.missing import R_FLOAT_NA TESTDATA_PATH = rdata.TESTDATA_PATH @@ -453,6 +454,9 @@ def test_encodings_v3(self) -> None: def test_dataframe(self) -> None: """Test dataframe conversion.""" + # Files created in R with + # test_dataframe = data.frame(class=factor(c("a", "b", "b")), value=c(1L, 2L, 3L)); save(test_dataframe, file="test_dataframe.rda", version=2) # noqa: E501 + # test_dataframe = data.frame(class=factor(c("a", "b", "b")), value=c(1L, 2L, 3L)); save(test_dataframe, file="test_dataframe_v3.rda") # noqa: E501 for f in ("test_dataframe.rda", "test_dataframe_v3.rda"): with self.subTest(file=f): data = rdata.read_rda(TESTDATA_PATH / f) @@ -475,6 +479,9 @@ def test_dataframe(self) -> None: def test_dataframe_rds(self) -> None: """Test dataframe conversion.""" + # Files created in R with + # df = data.frame(class=factor(c("a", "b", "b")), value=c(1L, 2L, 3L)); saveRDS(df, file="test_dataframe.rds", version=2) # noqa: E501 + # df = data.frame(class=factor(c("a", "b", "b")), value=c(1L, 2L, 3L)); saveRDS(df, file="test_dataframe_v3.rds") # noqa: E501 for f in ("test_dataframe.rds", "test_dataframe_v3.rds"): with self.subTest(file=f): data = rdata.read_rds(TESTDATA_PATH / f) @@ -515,6 +522,118 @@ def test_dataframe_rownames(self) -> None: ), ) + def test_dataframe_int_rownames(self) -> None: + """Test dataframe conversion.""" + # File created in R with + # df = 
data.frame(col1=c(10, 20, 30), row.names=c(3L, 6L, 9L)); saveRDS(df, file="test_dataframe_int_rownames.rds") # noqa: E501 + data = rdata.read_rds(TESTDATA_PATH / "test_dataframe_int_rownames.rds") + + index = np.array([3, 6, 9], dtype=np.int32) + ref = pd.DataFrame( + { + "col1": pd.Series( + [10., 20., 30.], + dtype=float, index=index), + }, + index=index, + ) + pd.testing.assert_frame_equal(data, ref) + + def test_dataframe_range_rownames(self) -> None: + """Test dataframe conversion.""" + # File created in R with + # df = data.frame(col1=c(10, 20, 30), row.names=2:4); saveRDS(df, file="test_dataframe_range_rownames.rds") # noqa: E501 + data = rdata.read_rds(TESTDATA_PATH / "test_dataframe_range_rownames.rds") + + index = pd.RangeIndex(2, 5) + ref = pd.DataFrame( + { + "col1": pd.Series( + [10., 20., 30.], + dtype=float, index=index), + }, + index=index, + ) + pd.testing.assert_frame_equal(data, ref) + + def test_dataframe_dtypes(self) -> None: + """Test dataframe conversion.""" + # File created in R with + # df = data.frame(int=c(10L, 20L, 30L), float=c(1.1, 2.2, 3.3), string=c("x", "y", "z"), bool=as.logical(c(1, 0, 1)), complex=c(4+5i, 6+7i, 8+9i)); print(df); saveRDS(df, file="test_dataframe_dtypes.rds") # noqa: E501 + data = rdata.read_rds(TESTDATA_PATH / "test_dataframe_dtypes.rds") + + index = pd.RangeIndex(1, 4) + ref = pd.DataFrame( + { + "int": pd.Series( + [10, 20, 30], + dtype=pd.Int32Dtype(), index=index), + "float": pd.Series( + [1.1, 2.2, 3.3], + dtype=float, index=index), + "string": pd.Series( + ["x" ,"y", "z"], + dtype=pd.StringDtype(), index=index), + "bool": pd.Series( + [True, False, True], + dtype=pd.BooleanDtype(), index=index), + "complex": pd.Series( + [4+5j, 6+7j, 8+9j], + dtype=complex, index=index), + }, + index=index, + ) + pd.testing.assert_frame_equal(data, ref) + + def test_dataframe_dtypes_with_na(self) -> None: + """Test dataframe conversion.""" + # File created in R with + # df = data.frame(int=c(10L, 20L, 30L, NA), float=c(1.1, 
2.2, 3.3, NA), string=c("x", "y", "z", NA), bool=as.logical(c(1, 0, 1, NA)), complex=c(4+5i, 6+7i, 8+9i, NA)); saveRDS(df, file="test_dataframe_dtypes_with_na.rds") # noqa: E501 + data = rdata.read_rds(TESTDATA_PATH / "test_dataframe_dtypes_with_na.rds") + + index = pd.RangeIndex(1, 5) + ref = pd.DataFrame( + { + "int": pd.Series( + [10, 20, 30, pd.NA], + dtype=pd.Int32Dtype(), index=index), + "float": pd.Series( + [1.1, 2.2, 3.3, R_FLOAT_NA], + dtype=float, index=index), + "string": pd.Series( + ["x" ,"y", "z", pd.NA], + dtype=pd.StringDtype(), index=index), + "bool": pd.Series( + [True, False, True, pd.NA], + dtype=pd.BooleanDtype(), index=index), + "complex": pd.Series( + [4+5j, 6+7j, 8+9j, R_FLOAT_NA], + dtype=complex, index=index), + }, + index=index, + ) + + with np.errstate(invalid="ignore"): + # Comparing complex arrays with R_FLOAT_NA gives warning + pd.testing.assert_frame_equal(data, ref) + + def test_dataframe_float_with_na_nan(self) -> None: + """Test dataframe conversion.""" + # File created in R with + # df = data.frame(float=c(1.1, 2.2, 3.3, NA, NaN, Inf, -Inf)); saveRDS(df, file="test_dataframe_float_with_na_nan.rds") # noqa: E501,ERA001 + data = rdata.read_rds(TESTDATA_PATH / "test_dataframe_float_with_na_nan.rds") + + index = pd.RangeIndex(1, 8) + ref = pd.DataFrame( + { + "float": pd.Series( + [1.1, 2.2, 3.3, R_FLOAT_NA, np.nan, np.inf, -np.inf], + dtype=float, index=index), + }, + index=index, + ) + pd.testing.assert_frame_equal(data, ref) + def test_ts(self) -> None: """Test time series conversion.""" data = rdata.read_rda(TESTDATA_PATH / "test_ts.rda") @@ -689,6 +808,7 @@ def test_altrep_wrap_real_attributes(self) -> None: data = rdata.conversion.convert(parsed) np.testing.assert_equal(data, [1., 2., 3.]) + @pytest.mark.filterwarnings("ignore:Missing constructor") def test_altrep_wrap_real_class_attribute(self) -> None: """Test alternative representation of wrap_real with class attribute.""" # File created in R with diff --git 
a/rdata/tests/test_write.py b/rdata/tests/test_write.py index 3413c6d..e733518 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -7,9 +7,12 @@ from pathlib import Path from typing import TYPE_CHECKING, Any +import numpy as np +import pandas as pd import pytest import rdata +from rdata.conversion import ConverterFromPythonToR, convert_python_to_r_object from rdata.unparser import unparse_data if TYPE_CHECKING: @@ -82,7 +85,8 @@ def test_unparse(fname: str) -> None: with (TESTDATA_PATH / fname).open("rb") as f: data = decompress_data(f.read()) file_type, file_format = parse_file_type_and_format(data) - r_data = rdata.parser.parse_data(data, expand_altrep=False) + r_data = rdata.parser.parse_data( + data, expand_altrep=False, extension=f".{file_type}") try: out_data = unparse_data( @@ -96,23 +100,30 @@ def test_unparse(fname: str) -> None: assert data == out_data +@pytest.mark.filterwarnings("ignore:Missing constructor") @pytest.mark.parametrize("fname", fnames, ids=fnames) -def test_convert_to_r(fname: str) -> None: +@pytest.mark.parametrize("expand_altrep", [True, False]) +def test_convert_to_r(fname: str, expand_altrep: bool) -> None: # noqa: FBT001 """Test converting Python data to RData object.""" with (TESTDATA_PATH / fname).open("rb") as f: - # Skip test files without unique R->py->R transformation + # Skip test files without unique transformation if fname in [ - "test_encodings.rda", # encoding not kept in Python - "test_encodings_v3.rda", # encoding not kept in Python - "test_list_attrs.rda", # attributes not kept in Python - "test_file.rda", # attributes not kept in Python + # encoding not kept in Python + "test_encodings.rda", + "test_encodings_v3.rda", + # attributes not kept in Python + "test_list_attrs.rda", + "test_file.rda", + "test_altrep_wrap_real_attributes.rds", + "test_altrep_wrap_real_class_attribute.rds", ]: - pytest.skip("ambiguous R->py->R transformation") + pytest.skip("ambiguous R-to-Python-to-R transformation") data 
= decompress_data(f.read()) file_type, file_format = parse_file_type_and_format(data) - r_data = rdata.parser.parse_data(data, expand_altrep=False) + r_data = rdata.parser.parse_data( + data, expand_altrep=expand_altrep, extension=f".{file_type}") try: py_data = rdata.conversion.convert(r_data) @@ -126,68 +137,153 @@ def test_convert_to_r(fname: str) -> None: else: encoding = encoding.lower() # type: ignore [assignment] + converter = ConverterFromPythonToR( + encoding=encoding, + format_version=r_data.versions.format, + r_version_serialized=r_data.versions.serialized, + ) + try: - if file_type == "rds": - r_obj = rdata.conversion.convert_to_r_object( - py_data, encoding=encoding) - else: - r_obj = rdata.conversion.convert_to_r_object_for_rda( - py_data, encoding=encoding) - new_r_data = rdata.conversion.build_r_data( - r_obj, - encoding=encoding, - format_version=r_data.versions.format, - r_version_serialized=r_data.versions.serialized, - ) + new_r_data = converter.convert_to_r_data(py_data, file_type=file_type) except NotImplementedError as e: pytest.xfail(str(e)) - assert r_data == new_r_data assert str(r_data) == str(new_r_data) + assert r_data == new_r_data + + # Check further that the resulting unparsed data is correct to ensure that + # Python-to-R conversion hasn't created any odd objects that can't be unparsed + if not expand_altrep: + file_type, file_format = parse_file_type_and_format(data) + out_data = unparse_data( + new_r_data, file_format=file_format, file_type=file_type) + + if file_format == "ascii": + data = data.replace(b"\r\n", b"\n") + + assert data == out_data -def test_convert_to_r_bad_rda() -> None: +def test_convert_to_r_rda_missing_names() -> None: """Test checking that data for RDA has variable names.""" - py_data = "hello" + converter = ConverterFromPythonToR() with pytest.raises(TypeError, match="(?i)data must be a dictionary"): - rdata.conversion.convert_to_r_object_for_rda(py_data) # type: ignore [arg-type] + 
converter.convert_to_r_data("hello", file_type="rda") + + +def test_convert_to_r_rda_nonstr_names() -> None: + """Test checking that RDA variable names are strings.""" + converter = ConverterFromPythonToR() + with pytest.raises(ValueError, match="(?i)keys must be strings"): + converter.convert_to_r_data({1: "hello"}, file_type="rda") def test_convert_to_r_empty_rda() -> None: """Test checking that data for RDA has variable names.""" py_data: dict[str, Any] = {} + converter = ConverterFromPythonToR() with pytest.raises(ValueError, match="(?i)data must not be empty"): - rdata.conversion.convert_to_r_object_for_rda(py_data) + converter.convert_to_r_data(py_data, file_type="rda") def test_unparse_bad_rda() -> None: """Test checking that data for RDA has variable names.""" py_data = "hello" - r_obj = rdata.conversion.convert_to_r_object(py_data) - r_data = rdata.conversion.build_r_data(r_obj) + converter = ConverterFromPythonToR() + r_data = converter.convert_to_r_data(py_data) with pytest.raises(ValueError, match="(?i)must be dictionary-like"): unparse_data(r_data, file_type="rda") def test_convert_to_r_bad_encoding() -> None: """Test checking encoding.""" + converter = ConverterFromPythonToR(encoding="non-existent") # type: ignore [arg-type] with pytest.raises(LookupError, match="(?i)unknown encoding"): - rdata.conversion.convert_to_r_object("ä", encoding="non-existent") # type: ignore [arg-type] + converter.convert_to_r_object("ä") def test_convert_to_r_unsupported_encoding() -> None: """Test checking encoding.""" + converter = ConverterFromPythonToR(encoding="cp1250") # type: ignore [arg-type] with pytest.raises(ValueError, match="(?i)unsupported encoding"): - rdata.conversion.convert_to_r_object("ä", encoding="cp1250") # type: ignore [arg-type] + converter.convert_to_r_object("ä") + +def test_convert_to_r_nonstr_dict_keys() -> None: + """Test checking non-string dict keys.""" + converter = ConverterFromPythonToR() + with pytest.raises(ValueError, match="(?i)keys 
must be strings"): + converter.convert_to_r_object({"a": 1, 2: 2}) -def test_unparse_big_int() -> None: + +@pytest.mark.parametrize("file_format", valid_formats) +@pytest.mark.parametrize("value", [-2**31 - 1, 2**31]) +def test_unparse_big_int(file_format: FileFormat, value: int) -> None: """Test checking too large integers.""" - big_int = 2**32 - r_obj = rdata.conversion.convert_to_r_object(big_int) - r_data = rdata.conversion.build_r_data(r_obj) + converter = ConverterFromPythonToR() + r_data = converter.convert_to_r_data(value) with pytest.raises(ValueError, match="(?i)not castable"): - unparse_data(r_data, file_format="xdr") + unparse_data(r_data, file_format=file_format) + + +def test_convert_dataframe_pandas_dtypes() -> None: + """Test converting dataframe with pandas dtypes.""" + df1 = pd.DataFrame( + { + "int": np.array([10, 20, 30], dtype=np.int32), + "float": [1.1, 2.2, 3.3], + "string": ["x" ,"y", "z"], + "bool": [True, False, True], + "complex": [4+5j, 6+7j, 8+9j], + }, + index=range(3), + ) + + index = pd.RangeIndex(3) + df2 = pd.DataFrame( + { + "int": pd.Series([10, 20, 30], dtype=pd.Int32Dtype(), index=index), + "float": pd.Series([1.1, 2.2, 3.3], dtype=pd.Float64Dtype(), index=index), + "string": pd.Series(["x" ,"y", "z"], dtype=pd.StringDtype(), index=index), + "bool": pd.Series([1, 0, 1], dtype=pd.BooleanDtype(), index=index), + "complex": pd.Series([4+5j, 6+7j, 8+9j], dtype=complex, index=index), + }, + index=index, + ) + + r_obj1 = convert_python_to_r_object(df1) + r_obj2 = convert_python_to_r_object(df2) + + assert str(r_obj1) == str(r_obj2) + assert r_obj1 == r_obj2 + + +def test_convert_dataframe_rangeindex() -> None: + """Test converting dataframe with rangeindex.""" + data = {"data": np.array([10, 20, 30], dtype=np.int32)} + + df1 = pd.DataFrame(data, index=pd.RangeIndex(3)) + df2 = pd.DataFrame(data, index=pd.Index([0, 1, 2])) + + r_obj1 = convert_python_to_r_object(df1) + r_obj2 = convert_python_to_r_object(df2) + + assert str(r_obj1) != 
str(r_obj2) + assert r_obj1 != r_obj2 + + +def test_convert_dataframe_rangeindex_flattened() -> None: + """Test converting dataframe with rangeindex.""" + data = {"data": np.array([10, 20, 30], dtype=np.int32)} + + df1 = pd.DataFrame(data, index=pd.RangeIndex(3, 8, 2)) + df2 = pd.DataFrame(data, index=pd.Index([3, 5, 7])) + + r_obj1 = convert_python_to_r_object(df1) + r_obj2 = convert_python_to_r_object(df2) + + assert str(r_obj1) == str(r_obj2) + assert r_obj1 == r_obj2 @pytest.mark.parametrize("compression", [*valid_compressions, "fail"]) diff --git a/rdata/unparser/__init__.py b/rdata/unparser/__init__.py index 0fdc243..02a189e 100644 --- a/rdata/unparser/__init__.py +++ b/rdata/unparser/__init__.py @@ -25,12 +25,12 @@ def unparse_file( - path: os.PathLike[Any] | str, - r_data: RData, - *, - file_format: FileFormat = "xdr", - file_type: FileType = "rds", - compression: Compression = "gzip", + path: os.PathLike[Any] | str, + r_data: RData, + *, + file_format: FileFormat = "xdr", + file_type: FileType = "rds", + compression: Compression = "gzip", ) -> None: """ Unparse RData object to a file. @@ -59,11 +59,11 @@ def unparse_file( def unparse_fileobj( - fileobj: IO[Any], - r_data: RData, - *, - file_format: FileFormat = "xdr", - file_type: FileType = "rds", + fileobj: IO[Any], + r_data: RData, + *, + file_format: FileFormat = "xdr", + file_type: FileType = "rds", ) -> None: """ Unparse RData object to a file object. 
@@ -78,9 +78,11 @@ def unparse_fileobj( if file_format == "ascii": from ._ascii import UnparserASCII as Unparser + rda_magic = "RDA" elif file_format == "xdr": from ._xdr import UnparserXDR as Unparser + rda_magic = "RDX" else: msg = f"Unknown file format: {file_format}" @@ -89,9 +91,11 @@ def unparse_fileobj( # Check that RData object for rda file is of correct kind if file_type == "rda": r_object = r_data.object - if not (r_object.info.type is RObjectType.LIST - and r_object.tag is not None - and r_object.tag.info.type is RObjectType.SYM): + if not ( + r_object.info.type is RObjectType.LIST + and r_object.tag is not None + and r_object.tag.info.type is RObjectType.SYM + ): msg = "r_data object must be dictionary-like for rda file" raise ValueError(msg) @@ -104,10 +108,10 @@ def unparse_fileobj( def unparse_data( - r_data: RData, - *, - file_format: FileFormat = "xdr", - file_type: FileType = "rds", + r_data: RData, + *, + file_format: FileFormat = "xdr", + file_type: FileType = "rds", ) -> bytes: """ Unparse RData object to a bytestring. diff --git a/rdata/unparser/_ascii.py b/rdata/unparser/_ascii.py index efb7b66..bab0461 100644 --- a/rdata/unparser/_ascii.py +++ b/rdata/unparser/_ascii.py @@ -7,11 +7,13 @@ import numpy as np +from rdata.missing import is_na + from ._unparser import Unparser if TYPE_CHECKING: import io - from typing import Any, Final + from typing import Final import numpy.typing as npt @@ -33,7 +35,7 @@ def escape(b: bytes) -> str: byte_to_str[byte] = escape(bytes([byte])) # Update mapping for special characters - byte_to_str[b'"'[0]] = r'\"' + byte_to_str[b'"'[0]] = r"\"" byte_to_str[b"'"[0]] = r"\'" byte_to_str[b"?"[0]] = r"\?" 
byte_to_str[b" "[0]] = r"\040" @@ -66,11 +68,10 @@ def unparse_magic(self) -> None: """Unparse magic bits.""" self._add_line("A") - def _unparse_array_values(self, array: npt.NDArray[Any]) -> None: - # Convert boolean to int - if np.issubdtype(array.dtype, np.bool_): - array = array.astype(np.int32) - + def _unparse_array_values_raw( + self, + array: npt.NDArray[np.int32 | np.float64 | np.complex128], + ) -> None: # Convert complex to pairs of floats if np.issubdtype(array.dtype, np.complexfloating): assert array.dtype == np.complex128 @@ -79,10 +80,12 @@ def _unparse_array_values(self, array: npt.NDArray[Any]) -> None: # Unparse data for value in array: if np.issubdtype(array.dtype, np.integer): - line = "NA" if value is None or np.ma.is_masked(value) else str(value) # type: ignore [no-untyped-call] + line = "NA" if is_na(value) else str(value) elif np.issubdtype(array.dtype, np.floating): - if np.isnan(value): + if is_na(value): + line = "NA" + elif np.isnan(value): line = "NaN" elif value == np.inf: line = "Inf" @@ -90,8 +93,7 @@ def _unparse_array_values(self, array: npt.NDArray[Any]) -> None: line = "-Inf" else: line = str(value) - if line.endswith(".0"): - line = line[:-2] + line = line.removesuffix(".0") else: msg = f"Unknown dtype: {array.dtype}" diff --git a/rdata/unparser/_unparser.py b/rdata/unparser/_unparser.py index b2b073e..b17630e 100644 --- a/rdata/unparser/_unparser.py +++ b/rdata/unparser/_unparser.py @@ -7,6 +7,7 @@ import numpy as np +from rdata.missing import R_INT_NA from rdata.parser import ( RData, RExtraInfo, @@ -69,9 +70,35 @@ def unparse_array(self, array: npt.NDArray[Any]) -> None: self.unparse_int(array.size) self._unparse_array_values(array) - @abc.abstractmethod def _unparse_array_values(self, array: npt.NDArray[Any]) -> None: """Unparse the values of an array.""" + # Convert boolean to int + if np.issubdtype(array.dtype, np.bool_): + array = array.astype(np.int32) + + # Flatten masked values and convert int arrays to int32 + if 
np.issubdtype(array.dtype, np.integer): + if np.ma.is_masked(array): # type: ignore [no-untyped-call] + mask = np.ma.getmask(array) # type: ignore [no-untyped-call] + array = np.ma.getdata(array).copy() # type: ignore [no-untyped-call] + array[mask] = R_INT_NA + + if array.dtype != np.int32: + info = np.iinfo(np.int32) + if np.any(array > info.max) or np.any(array < info.min): + msg = "Integer array not castable to int32" + raise ValueError(msg) + array = array.astype(np.int32) + + assert array.dtype in (np.int32, np.float64, np.complex128) + self._unparse_array_values_raw(array) + + @abc.abstractmethod + def _unparse_array_values_raw( + self, + array: npt.NDArray[np.int32 | np.float64 | np.complex128], + ) -> None: + """Unparse the values of an array as such.""" def unparse_string(self, value: bytes | None) -> None: """Unparse a string.""" @@ -106,8 +133,9 @@ def unparse_r_object(self, obj: RObject) -> None: # noqa: C901, PLR0912 # Unparse data value = obj.value if info.type in { - RObjectType.NIL, - RObjectType.NILVALUE, + RObjectType.NIL, + RObjectType.NILVALUE, + RObjectType.REF, }: # These types don't have any data assert value is None @@ -118,6 +146,7 @@ def unparse_r_object(self, obj: RObject) -> None: # noqa: C901, PLR0912 elif info.type in { RObjectType.LIST, RObjectType.LANG, + RObjectType.ALTREP, # Parser treats the following equal to LIST. 
# Not tested if they work # RObjectType.CLO, diff --git a/rdata/unparser/_xdr.py b/rdata/unparser/_xdr.py index 742aa87..255c182 100644 --- a/rdata/unparser/_xdr.py +++ b/rdata/unparser/_xdr.py @@ -2,17 +2,14 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any - -import numpy as np - -from rdata.parser import R_INT_NA +from typing import TYPE_CHECKING from ._unparser import Unparser if TYPE_CHECKING: import io + import numpy as np import numpy.typing as npt @@ -30,23 +27,10 @@ def unparse_magic(self) -> None: """Unparse magic bits.""" self.file.write(b"X\n") - def _unparse_array_values(self, array: npt.NDArray[Any]) -> None: - # Convert boolean to int - if np.issubdtype(array.dtype, np.bool_): - array = array.astype(np.int32) - - # Flatten masked values and convert int arrays to int32 - if np.issubdtype(array.dtype, np.integer): - if np.ma.is_masked(array): # type: ignore [no-untyped-call] - mask = np.ma.getmask(array) # type: ignore [no-untyped-call] - array = np.ma.getdata(array).copy() # type: ignore [no-untyped-call] - array[mask] = R_INT_NA - info = np.iinfo(np.int32) - if not all(info.min <= val <= info.max for val in array): - msg = "Integer array not castable to int32" - raise ValueError(msg) - array = array.astype(np.int32) - + def _unparse_array_values_raw( + self, + array: npt.NDArray[np.int32 | np.float64 | np.complex128], + ) -> None: # Convert to big endian if needed array = array.astype(array.dtype.newbyteorder(">"))