From 3a6664d4d39d7c08ed59e7c591f2a186e012f0c8 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 5 Sep 2024 17:13:05 +0300 Subject: [PATCH 001/100] Add reference type to unparser --- rdata/unparser/_unparser.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rdata/unparser/_unparser.py b/rdata/unparser/_unparser.py index 7361b65..1524317 100644 --- a/rdata/unparser/_unparser.py +++ b/rdata/unparser/_unparser.py @@ -98,8 +98,9 @@ def unparse_r_object(self, obj: RObject) -> None: # noqa: C901, PLR0912 # Unparse data value = obj.value if info.type in { - RObjectType.NIL, - RObjectType.NILVALUE, + RObjectType.NIL, + RObjectType.NILVALUE, + RObjectType.REF, }: # These types don't have any data assert value is None From 153f80368fd2097bcdaf8de6a1e2cffb60938ddc Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 5 Sep 2024 17:13:53 +0300 Subject: [PATCH 002/100] Add draft dataframe conversion --- rdata/conversion/to_r.py | 107 +++++++++++++++++++++++++++++++++++---- 1 file changed, 98 insertions(+), 9 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 39df39e..7fc4c86 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING import numpy as np +import pandas as pd from rdata.parser import ( CharFlags, @@ -49,6 +50,23 @@ def __call__(self, data: Any, *, encoding: Encoding) -> RObject: # noqa: ANN401 }) +def find_is_object(attributes: RObject | None): + if attributes is None: + return False + info = attributes.info + if info.type != RObjectType.LIST: + return False + if not info.tag: + return False + tag = attributes.tag + if tag.info.type == RObjectType.REF: + tag = tag.referenced_object + if (tag.info.type == RObjectType.SYM + and tag.value.value == b"class"): + return True + return find_is_object(attributes.value[1]) + + def build_r_object( r_type: RObjectType, *, @@ -56,6 +74,7 @@ def build_r_object( attributes: RObject | None = None, tag: RObject | None = None, gp: int = 0, + reference: int = 0, ) -> RObject: """ Build R object. @@ -66,6 +85,7 @@ def build_r_object( attributes: Same as in RObject. tag: Same as in RObject. gp: Same as in RObjectInfo. + reference: Same as in RObjectInfo. Returns: R object. @@ -75,19 +95,24 @@ def build_r_object( RObjectInfo """ assert r_type is not None + if reference == 0: + assert reference_name_list[reference] is None + else: + assert r_type == RObjectType.REF + is_object = find_is_object(attributes) return RObject( RObjectInfo( r_type, - object=False, + object=is_object, attributes=attributes is not None, tag=tag is not None, gp=gp, - reference=0, + reference=reference, ), value, attributes, tag, - None, + reference_obj_list[reference], ) @@ -138,8 +163,13 @@ def build_r_list( ) +# XXX global lists +reference_name_list = [None] +reference_obj_list = [None] + + def build_r_sym( - data: str, + name: str, *, encoding: Encoding, ) -> RObject: @@ -147,15 +177,26 @@ def build_r_sym( Build R object representing symbol. Args: - data: String. - encoding: Encoding to be used for strings within data. + name: String. + encoding: Encoding to be used for the name. Returns: R object. """ - r_type = RObjectType.SYM - r_value = convert_to_r_object(data.encode(encoding), encoding=encoding) - return build_r_object(r_type, value=r_value) + # Reference to existing symbol if exists + if name in reference_name_list: + # XXX can any symbol be referenced??? + reference = reference_name_list.index(name) + return build_r_object(RObjectType.REF, reference=reference) + + # Create a new symbol + r_value = convert_to_r_object(name.encode(encoding), encoding=encoding) + r_object = build_r_object(RObjectType.SYM, value=r_value) + + # Add to reference list + reference_name_list.append(name) + reference_obj_list.append(r_object) + return r_object def build_r_data( @@ -336,6 +377,54 @@ def convert_to_r_object( # noqa: C901, PLR0912, PLR0915 raise ValueError(msg) r_value = data + elif isinstance(data, pd.Series): + array = data.array + if isinstance(array, pd.Categorical): + return convert_to_r_object(array, encoding=encoding) + elif isinstance(array, pd.arrays.IntegerArray): + return convert_to_r_object(data.to_numpy(), encoding=encoding) + else: + msg = f"pd.Series {type(array)} not implemented" + raise NotImplementedError(msg) + + elif isinstance(data, pd.Categorical): + r_type = RObjectType.INT + r_value = data.codes + 1 + attributes = build_r_list({ + "levels": np.asarray(list(data.categories)), + "class": "factor", + }, + encoding=encoding) + + elif isinstance(data, pd.DataFrame): + r_type = RObjectType.VEC + names = [] + r_value = [] + for column, series in data.items(): + names.append(column) + r_value.append(convert_to_r_object(series, encoding=encoding)) + + index = data.index + if (isinstance(index, pd.RangeIndex) + and index.start == 1 + and index.stop == data.shape[0] + 1 + and index.step == 1 + ): + row_names = np.ma.array( + data=[0, -data.shape[0]], + mask=[True, False], + ) + else: + msg = f"pd.DataFrame index {type(index)} not implemented" + raise NotImplementedError(msg) + + attributes = build_r_list({ + "names": np.asarray(names), + "row.names": row_names, + "class": "data.frame", + }, + encoding=encoding) + else: msg = f"type {type(data)} not implemented" raise NotImplementedError(msg) From 45575593818c56cd099bd5edeec3b46a137d058f Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 5 Sep 2024 17:51:43 +0300 Subject: [PATCH 003/100] Add helper function for creating unicode arrays --- rdata/conversion/to_r.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 7fc4c86..08aef77 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -25,6 +25,8 @@ ) if TYPE_CHECKING: + import numpy.typing as npt + from collections.abc import Mapping from typing import Any, Final, Literal, Protocol @@ -50,6 +52,21 @@ def __call__(self, data: Any, *, encoding: Encoding) -> RObject: # noqa: ANN401 }) +def create_unicode_array( + names: Any, +) -> npt.NDArray[Any]: + """ + Create unicode array from sequence/iterator of strings. + + Args: + names: Strings. + + Returns: + Array. + """ + return np.array(list(names), dtype=np.dtype("U")) + + def find_is_object(attributes: RObject | None): if attributes is None: return False @@ -311,7 +328,7 @@ def convert_to_r_object( # noqa: C901, PLR0912, PLR0915 r_value = [convert_to_r_object(el, encoding=encoding) for el in values] if isinstance(data, dict): - names = np.array(list(data.keys()), dtype=np.dtype("U")) + names = create_unicode_array(data.keys()) attributes = build_r_list({"names": names}, encoding=encoding) @@ -391,7 +408,7 @@ def convert_to_r_object( # noqa: C901, PLR0912, PLR0915 r_type = RObjectType.INT r_value = data.codes + 1 attributes = build_r_list({ - "levels": np.asarray(list(data.categories)), + "levels": create_unicode_array(data.categories), "class": "factor", }, encoding=encoding) @@ -410,7 +427,7 @@ def convert_to_r_object( # noqa: C901, PLR0912, PLR0915 and index.stop == data.shape[0] + 1 and index.step == 1 ): - row_names = np.ma.array( + row_names = np.ma.array( # type: ignore [no-untyped-call] data=[0, -data.shape[0]], mask=[True, False], ) @@ -419,7 +436,7 @@ def convert_to_r_object( # noqa: C901, PLR0912, PLR0915 raise NotImplementedError(msg) attributes = build_r_list({ - "names": np.asarray(names), + "names": create_unicode_array(names), "row.names": row_names, "class": "data.frame", }, From 6eeb992ac82e1e4d749833a0c344768dc10ce570 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 5 Sep 2024 18:17:24 +0300 Subject: [PATCH 004/100] Add more pd.Series types --- rdata/conversion/to_r.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 08aef77..72e4180 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -398,7 +398,10 @@ def convert_to_r_object( # noqa: C901, PLR0912, PLR0915 array = data.array if isinstance(array, pd.Categorical): return convert_to_r_object(array, encoding=encoding) - elif isinstance(array, pd.arrays.IntegerArray): + elif isinstance(array, pd.arrays.StringArray): + return convert_to_r_object(create_unicode_array(array), encoding=encoding) + elif (isinstance(array, pd.arrays.IntegerArray) + or isinstance(array, pd.arrays.NumpyExtensionArray)): return convert_to_r_object(data.to_numpy(), encoding=encoding) else: msg = f"pd.Series {type(array)} not implemented" From ffddf74edf088b87b4a3ae0fc5139a58d65958fb Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 5 Sep 2024 18:18:01 +0300 Subject: [PATCH 005/100] Fix the order of symbol references --- rdata/conversion/to_r.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 72e4180..dd17b3a 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -161,8 +161,8 @@ def build_r_list( if isinstance(data, dict): data = data.copy() key = next(iter(data)) - value1 = convert_value(data.pop(key), encoding=encoding) tag = build_r_sym(key, encoding=encoding) + value1 = convert_value(data.pop(key), encoding=encoding) elif isinstance(data, list): value1 = convert_value(data[0], encoding=encoding) data = data[1:] From eb82ff657dc6f6ea41b2bbc7051f105776878468 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Tue, 10 Sep 2024 09:10:28 +0300 Subject: [PATCH 006/100] Add a converter class for Python-to-R conversion --- rdata/_write.py | 12 +- rdata/conversion/__init__.py | 3 +- rdata/conversion/to_r.py | 529 +++++++++++++++++------------------ rdata/tests/test_write.py | 32 ++- 4 files changed, 282 insertions(+), 294 deletions(-) diff --git a/rdata/_write.py b/rdata/_write.py index 39a8255..0630d1e 100644 --- a/rdata/_write.py +++ b/rdata/_write.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING -from .conversion import build_r_data, convert_to_r_object, convert_to_r_object_for_rda +from .conversion import build_r_data, ConverterFromPythonToR from .conversion.to_r import DEFAULT_FORMAT_VERSION from .unparser import unparse_file @@ -52,10 +52,7 @@ def write_rds( >>> data = ["hello", 1, 2.2, 3.3+4.4j] >>> rdata.write_rds("test.rds", data) """ - r_object = convert_to_r_object( - data, - encoding=encoding, - ) + r_object = ConverterFromPythonToR(encoding=encoding).convert_to_r_object(data) r_data = build_r_data( r_object, encoding=encoding, @@ -107,10 +104,7 @@ def write_rda( >>> data = {"name": "hello", "values": [1, 2.2, 3.3+4.4j]} >>> rdata.write_rda("test.rda", data) """ - r_object = convert_to_r_object_for_rda( - data, - encoding=encoding, - ) + r_object = ConverterFromPythonToR(encoding=encoding).convert_to_r_object_for_rda(data) r_data = build_r_data( r_object, encoding=encoding, diff --git a/rdata/conversion/__init__.py b/rdata/conversion/__init__.py index 44f6ad7..2ec4f44 100644 --- a/rdata/conversion/__init__.py +++ b/rdata/conversion/__init__.py @@ -26,6 +26,5 @@ ) from .to_r import ( build_r_data as build_r_data, - convert_to_r_object as convert_to_r_object, - convert_to_r_object_for_rda as convert_to_r_object_for_rda, + ConverterFromPythonToR as ConverterFromPythonToR, ) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index dd17b3a..918b89b 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -91,7 +91,7 @@ def build_r_object( attributes: RObject | None = None, tag: RObject | None = None, gp: int = 0, - reference: int = 0, + reference: tuple(int, RObject | None) = (0, None), ) -> RObject: """ Build R object. @@ -102,7 +102,7 @@ def build_r_object( attributes: Same as in RObject. tag: Same as in RObject. gp: Same as in RObjectInfo. - reference: Same as in RObjectInfo. + reference: Tuple of integer and object. Returns: R object. @@ -112,10 +112,8 @@ def build_r_object( RObjectInfo """ assert r_type is not None - if reference == 0: - assert reference_name_list[reference] is None - else: - assert r_type == RObjectType.REF + reference_id, referenced_object = reference + assert (reference_id == 0) == (referenced_object == None) == (r_type != RObjectType.REF) is_object = find_is_object(attributes) return RObject( RObjectInfo( @@ -124,98 +122,15 @@ def build_r_object( attributes=attributes is not None, tag=tag is not None, gp=gp, - reference=reference, + reference=reference_id, ), value, attributes, tag, - reference_obj_list[reference], + referenced_object, ) -def build_r_list( - data: Mapping[str, Any] | list[Any], - *, - encoding: Encoding, - convert_value: Converter | None = None, -) -> RObject: - """ - Build R object representing named linked list. - - Args: - data: Non-empty dictionary or list. - encoding: Encoding to be used for strings within data. - convert_value: Function used for converting value to R object - (for example, convert_to_r_object). - - Returns: - R object. - """ - if convert_value is None: - convert_value = convert_to_r_object - - if len(data) == 0: - msg = "data must not be empty" - raise ValueError(msg) - - if isinstance(data, dict): - data = data.copy() - key = next(iter(data)) - tag = build_r_sym(key, encoding=encoding) - value1 = convert_value(data.pop(key), encoding=encoding) - elif isinstance(data, list): - value1 = convert_value(data[0], encoding=encoding) - data = data[1:] - tag = None - - if len(data) == 0: - value2 = build_r_object(RObjectType.NILVALUE) - else: - value2 = build_r_list(data, encoding=encoding, convert_value=convert_value) - - return build_r_object( - RObjectType.LIST, - value=(value1, value2), - tag=tag, - ) - - -# XXX global lists -reference_name_list = [None] -reference_obj_list = [None] - - -def build_r_sym( - name: str, - *, - encoding: Encoding, -) -> RObject: - """ - Build R object representing symbol. - - Args: - name: String. - encoding: Encoding to be used for the name. - - Returns: - R object. - """ - # Reference to existing symbol if exists - if name in reference_name_list: - # XXX can any symbol be referenced??? - reference = reference_name_list.index(name) - return build_r_object(RObjectType.REF, reference=reference) - - # Create a new symbol - r_value = convert_to_r_object(name.encode(encoding), encoding=encoding) - r_object = build_r_object(RObjectType.SYM, value=r_value) - - # Add to reference list - reference_name_list.append(name) - reference_obj_list.append(r_object) - return r_object - - def build_r_data( r_object: RObject, *, @@ -228,7 +143,7 @@ def build_r_data( Args: r_object: R object. - encoding: Encoding to be used for strings within data. + encoding: Encoding saved in the metadata. format_version: File format version. r_version_serialized: R version written as the creator of the object. @@ -252,201 +167,275 @@ def build_r_data( return RData(versions, extra, r_object) -def convert_to_r_object_for_rda( - data: Mapping[str, Any], - *, - encoding: Encoding = "utf-8", -) -> RObject: +class ConverterFromPythonToR: """ - Convert Python dictionary to R object for RDA file. + Class converting Python objects to R objects. Args: - data: Python dictionary with data and variable names. encoding: Encoding to be used for strings within data. - - Returns: - Corresponding R object. - - See Also: - convert_to_r_object """ - if not isinstance(data, dict): - msg = f"for RDA file, data must be a dictionary, not type {type(data)}" - raise TypeError(msg) - return build_r_list(data, encoding=encoding) + def __init__(self, *, encoding: Encoding = "utf-8"): + self.encoding = encoding + self.reference_name_list = [None] + self.reference_obj_list = [None] + + + def build_r_list(self, + data: Mapping[str, Any] | list[Any], + *, + convert_value: Converter | None = None, + ) -> RObject: + """ + Build R object representing named linked list. + + Args: + data: Non-empty dictionary or list. + convert_value: Function used for converting value to R object + (for example, convert_to_r_object). + + Returns: + R object. + """ + if convert_value is None: + convert_value = self.convert_to_r_object + + if len(data) == 0: + msg = "data must not be empty" + raise ValueError(msg) -def convert_to_r_object( # noqa: C901, PLR0912, PLR0915 - data: Any, # noqa: ANN401 - *, - encoding: Encoding = "utf-8", -) -> RObject: - """ - Convert Python data to R object. - - Args: - data: Any Python object. - encoding: Encoding to be used for strings within data. + if isinstance(data, dict): + data = data.copy() + key = next(iter(data)) + tag = self.build_r_sym(key) + value1 = convert_value(data.pop(key)) + elif isinstance(data, list): + value1 = convert_value(data[0]) + data = data[1:] + tag = None + + if len(data) == 0: + value2 = build_r_object(RObjectType.NILVALUE) + else: + value2 = self.build_r_list(data, convert_value=convert_value) + + return build_r_object( + RObjectType.LIST, + value=(value1, value2), + tag=tag, + ) + + + def build_r_sym(self, + name: str, + ) -> RObject: + """ + Build R object representing symbol. + + Args: + name: String. + + Returns: + R object. + """ + # Reference to existing symbol if exists + if name in self.reference_name_list: + idx = self.reference_name_list.index(name) + obj = self.reference_obj_list[idx] + return build_r_object(RObjectType.REF, reference=(idx, obj)) + + # Create a new symbol + r_value = self.convert_to_r_object(name.encode(self.encoding)) + r_object = build_r_object(RObjectType.SYM, value=r_value) + + # Add to reference list + self.reference_name_list.append(name) + self.reference_obj_list.append(r_object) + return r_object + + + def convert_to_r_object_for_rda(self, + data: Mapping[str, Any], + ) -> RObject: + """ + Convert Python dictionary to R object for RDA file. + + Args: + data: Python dictionary with data and variable names. + + Returns: + Corresponding R object. + + See Also: + convert_to_r_object + """ + if not isinstance(data, dict): + msg = f"for RDA file, data must be a dictionary, not type {type(data)}" + raise TypeError(msg) + return self.build_r_list(data) + + + def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 + data: Any, # noqa: ANN401 + ) -> RObject: + """ + Convert Python data to R object. + + Args: + data: Any Python object. + + Returns: + Corresponding R object. + """ + # Default args for most types (None/False/0) + r_type = None + values: list[Any] | tuple[Any, ...] + r_value: Any = None + gp = 0 + attributes = None + tag = None - Returns: - Corresponding R object. + if data is None: + r_type = RObjectType.NILVALUE - See Also: - convert_to_r_data - """ - # Default args for most types (None/False/0) - r_type = None - values: list[Any] | tuple[Any, ...] - r_value: Any = None - gp = 0 - attributes = None - tag = None - - if data is None: - r_type = RObjectType.NILVALUE - - elif isinstance(data, RExpression): - r_type = RObjectType.EXPR - r_value = [convert_to_r_object(el, encoding=encoding) for el in data.elements] - - elif isinstance(data, RLanguage): - r_type = RObjectType.LANG - values = data.elements - r_value = (build_r_sym(str(values[0]), encoding=encoding), - build_r_list(values[1:], encoding=encoding, - convert_value=build_r_sym)) - - if len(data.attributes) > 0: - # The following might work here (untested) - # attributes = build_r_list(data.attributes, encoding=encoding) # noqa: ERA001,E501 - msg = f"type {r_type} with attributes not implemented" - raise NotImplementedError(msg) + elif isinstance(data, RExpression): + r_type = RObjectType.EXPR + r_value = [self.convert_to_r_object(el) for el in data.elements] - elif isinstance(data, (list, tuple, dict)): - r_type = RObjectType.VEC - values = list(data.values()) if isinstance(data, dict) else data - r_value = [convert_to_r_object(el, encoding=encoding) for el in values] + elif isinstance(data, RLanguage): + r_type = RObjectType.LANG + values = data.elements + r_value = (self.build_r_sym(str(values[0])), + self.build_r_list(values[1:], + convert_value=self.build_r_sym)) - if isinstance(data, dict): - names = create_unicode_array(data.keys()) - attributes = build_r_list({"names": names}, - encoding=encoding) - - elif isinstance(data, np.ndarray): - if data.dtype.kind in ["O"]: - # This is a special case handling only np.array([None]) - if data.size != 1 or data[0] is not None: - msg = "general object array not implemented" + if len(data.attributes) > 0: + # The following might work here (untested) + # attributes = build_r_list(data.attributes) # noqa: ERA001,E501 + msg = f"type {r_type} with attributes not implemented" raise NotImplementedError(msg) - r_type = RObjectType.STR - r_value = [build_r_object(RObjectType.CHAR)] - elif data.dtype.kind in ["S"]: - assert data.ndim == 1 - r_type = RObjectType.STR - r_value = [convert_to_r_object(el, encoding=encoding) for el in data] + elif isinstance(data, (list, tuple, dict)): + r_type = RObjectType.VEC + values = list(data.values()) if isinstance(data, dict) else data + r_value = [self.convert_to_r_object(el) for el in values] + + if isinstance(data, dict): + names = create_unicode_array(data.keys()) + attributes = self.build_r_list({"names": names}) + + elif isinstance(data, np.ndarray): + if data.dtype.kind in ["O"]: + # This is a special case handling only np.array([None]) + if data.size != 1 or data[0] is not None: + msg = "general object array not implemented" + raise NotImplementedError(msg) + r_type = RObjectType.STR + r_value = [build_r_object(RObjectType.CHAR)] + + elif data.dtype.kind in ["S"]: + assert data.ndim == 1 + r_type = RObjectType.STR + r_value = [self.convert_to_r_object(el) for el in data] + + elif data.dtype.kind in ["U"]: + assert data.ndim == 1 + data = np.array([s.encode(self.encoding) for s in data], dtype=np.dtype("S")) + return self.convert_to_r_object(data) - elif data.dtype.kind in ["U"]: - assert data.ndim == 1 - data = np.array([s.encode(encoding) for s in data], dtype=np.dtype("S")) - return convert_to_r_object(data, encoding=encoding) + else: + r_type = { + "b": RObjectType.LGL, + "i": RObjectType.INT, + "f": RObjectType.REAL, + "c": RObjectType.CPLX, + }[data.dtype.kind] + + if data.ndim == 0: + r_value = data[np.newaxis] + elif data.ndim == 1: + r_value = data + else: + # R uses column-major order like Fortran + r_value = np.ravel(data, order="F") + attributes = self.build_r_list({"dim": np.array(data.shape)}) + + elif isinstance(data, (bool, int, float, complex)): + return self.convert_to_r_object(np.array(data)) + + elif isinstance(data, str): + r_type = RObjectType.STR + r_value = [self.convert_to_r_object(data.encode(self.encoding))] + + elif isinstance(data, bytes): + r_type = RObjectType.CHAR + if all(chr(byte) in string.printable for byte in data): + gp = CharFlags.ASCII + elif self.encoding == "utf-8": + gp = CharFlags.UTF8 + elif self.encoding == "cp1252": + # Note! + # CP1252 and Latin1 are not the same. + # Does CharFlags.LATIN1 mean actually CP1252 + # as R on Windows mentions CP1252 as encoding? + # Or does CP1252 change to e.g. CP1250 depending on localization? + gp = CharFlags.LATIN1 + else: + msg = f"unsupported encoding: {self.encoding}" + raise ValueError(msg) + r_value = data + + elif isinstance(data, pd.Series): + array = data.array + if isinstance(array, pd.Categorical): + return self.convert_to_r_object(array) + elif isinstance(array, pd.arrays.StringArray): + return self.convert_to_r_object(create_unicode_array(array)) + elif (isinstance(array, pd.arrays.IntegerArray) + or isinstance(array, pd.arrays.NumpyExtensionArray)): + return self.convert_to_r_object(data.to_numpy()) + else: + msg = f"pd.Series {type(array)} not implemented" + raise NotImplementedError(msg) - else: - r_type = { - "b": RObjectType.LGL, - "i": RObjectType.INT, - "f": RObjectType.REAL, - "c": RObjectType.CPLX, - }[data.dtype.kind] - - if data.ndim == 0: - r_value = data[np.newaxis] - elif data.ndim == 1: - r_value = data + elif isinstance(data, pd.Categorical): + r_type = RObjectType.INT + r_value = data.codes + 1 + attributes = self.build_r_list({ + "levels": create_unicode_array(data.categories), + "class": "factor", + }) + + elif isinstance(data, pd.DataFrame): + r_type = RObjectType.VEC + names = [] + r_value = [] + for column, series in data.items(): + names.append(column) + r_value.append(self.convert_to_r_object(series)) + + index = data.index + if (isinstance(index, pd.RangeIndex) + and index.start == 1 + and index.stop == data.shape[0] + 1 + and index.step == 1 + ): + row_names = np.ma.array( # type: ignore [no-untyped-call] + data=[0, -data.shape[0]], + mask=[True, False], + ) else: - # R uses column-major order like Fortran - r_value = np.ravel(data, order="F") - attributes = build_r_list({"dim": np.array(data.shape)}, - encoding=encoding) - - elif isinstance(data, (bool, int, float, complex)): - return convert_to_r_object(np.array(data), encoding=encoding) - - elif isinstance(data, str): - r_type = RObjectType.STR - r_value = [convert_to_r_object(data.encode(encoding), encoding=encoding)] - - elif isinstance(data, bytes): - r_type = RObjectType.CHAR - if all(chr(byte) in string.printable for byte in data): - gp = CharFlags.ASCII - elif encoding == "utf-8": - gp = CharFlags.UTF8 - elif encoding == "cp1252": - # Note! - # CP1252 and Latin1 are not the same. - # Does CharFlags.LATIN1 mean actually CP1252 - # as R on Windows mentions CP1252 as encoding? - # Or does CP1252 change to e.g. CP1250 depending on localization? - gp = CharFlags.LATIN1 - else: - msg = f"unsupported encoding: {encoding}" - raise ValueError(msg) - r_value = data - - elif isinstance(data, pd.Series): - array = data.array - if isinstance(array, pd.Categorical): - return convert_to_r_object(array, encoding=encoding) - elif isinstance(array, pd.arrays.StringArray): - return convert_to_r_object(create_unicode_array(array), encoding=encoding) - elif (isinstance(array, pd.arrays.IntegerArray) - or isinstance(array, pd.arrays.NumpyExtensionArray)): - return convert_to_r_object(data.to_numpy(), encoding=encoding) - else: - msg = f"pd.Series {type(array)} not implemented" - raise NotImplementedError(msg) + msg = f"pd.DataFrame index {type(index)} not implemented" + raise NotImplementedError(msg) + + attributes = self.build_r_list({ + "names": create_unicode_array(names), + "row.names": row_names, + "class": "data.frame", + }) - elif isinstance(data, pd.Categorical): - r_type = RObjectType.INT - r_value = data.codes + 1 - attributes = build_r_list({ - "levels": create_unicode_array(data.categories), - "class": "factor", - }, - encoding=encoding) - - elif isinstance(data, pd.DataFrame): - r_type = RObjectType.VEC - names = [] - r_value = [] - for column, series in data.items(): - names.append(column) - r_value.append(convert_to_r_object(series, encoding=encoding)) - - index = data.index - if (isinstance(index, pd.RangeIndex) - and index.start == 1 - and index.stop == data.shape[0] + 1 - and index.step == 1 - ): - row_names = np.ma.array( # type: ignore [no-untyped-call] - data=[0, -data.shape[0]], - mask=[True, False], - ) else: - msg = f"pd.DataFrame index {type(index)} not implemented" + msg = f"type {type(data)} not implemented" raise NotImplementedError(msg) - attributes = build_r_list({ - "names": create_unicode_array(names), - "row.names": row_names, - "class": "data.frame", - }, - encoding=encoding) - - else: - msg = f"type {type(data)} not implemented" - raise NotImplementedError(msg) - - return build_r_object(r_type, value=r_value, attributes=attributes, tag=tag, gp=gp) + return build_r_object(r_type, value=r_value, attributes=attributes, tag=tag, gp=gp) diff --git a/rdata/tests/test_write.py b/rdata/tests/test_write.py index 3413c6d..57c358a 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -10,6 +10,7 @@ import pytest import rdata +from rdata.conversion import ConverterFromPythonToR, build_r_data from rdata.unparser import unparse_data if TYPE_CHECKING: @@ -127,13 +128,12 @@ def test_convert_to_r(fname: str) -> None: encoding = encoding.lower() # type: ignore [assignment] try: + converter = ConverterFromPythonToR(encoding=encoding) if file_type == "rds": - r_obj = rdata.conversion.convert_to_r_object( - py_data, encoding=encoding) + r_obj = converter.convert_to_r_object(py_data) else: - r_obj = rdata.conversion.convert_to_r_object_for_rda( - py_data, encoding=encoding) - new_r_data = rdata.conversion.build_r_data( + r_obj = converter.convert_to_r_object_for_rda(py_data) + new_r_data = build_r_data( r_obj, encoding=encoding, format_version=r_data.versions.format, @@ -150,21 +150,24 @@ def test_convert_to_r_bad_rda() -> None: """Test checking that data for RDA has variable names.""" py_data = "hello" with pytest.raises(TypeError, match="(?i)data must be a dictionary"): - rdata.conversion.convert_to_r_object_for_rda(py_data) # type: ignore [arg-type] + converter = ConverterFromPythonToR() + converter.convert_to_r_object_for_rda(py_data) # type: ignore [arg-type] def test_convert_to_r_empty_rda() -> None: """Test checking that data for RDA has variable names.""" py_data: dict[str, Any] = {} with pytest.raises(ValueError, match="(?i)data must not be empty"): - rdata.conversion.convert_to_r_object_for_rda(py_data) + converter = ConverterFromPythonToR() + converter.convert_to_r_object_for_rda(py_data) def test_unparse_bad_rda() -> None: """Test checking that data for RDA has variable names.""" py_data = "hello" - r_obj = rdata.conversion.convert_to_r_object(py_data) - r_data = rdata.conversion.build_r_data(r_obj) + converter = ConverterFromPythonToR() + r_obj = converter.convert_to_r_object(py_data) + r_data = build_r_data(r_obj) with pytest.raises(ValueError, match="(?i)must be dictionary-like"): unparse_data(r_data, file_type="rda") @@ -172,20 +175,23 @@ def test_unparse_bad_rda() -> None: def test_convert_to_r_bad_encoding() -> None: """Test checking encoding.""" with pytest.raises(LookupError, match="(?i)unknown encoding"): - rdata.conversion.convert_to_r_object("ä", encoding="non-existent") # type: ignore [arg-type] + converter = ConverterFromPythonToR(encoding="non-existent") + converter.convert_to_r_object("ä") # type: ignore [arg-type] def test_convert_to_r_unsupported_encoding() -> None: """Test checking encoding.""" with pytest.raises(ValueError, match="(?i)unsupported encoding"): - rdata.conversion.convert_to_r_object("ä", encoding="cp1250") # type: ignore [arg-type] + converter = ConverterFromPythonToR(encoding="cp1250") + converter.convert_to_r_object("ä") # type: ignore [arg-type] def test_unparse_big_int() -> None: """Test checking too large integers.""" big_int = 2**32 - r_obj = rdata.conversion.convert_to_r_object(big_int) - r_data = rdata.conversion.build_r_data(r_obj) + converter = ConverterFromPythonToR() + r_obj = converter.convert_to_r_object(big_int) + r_data = build_r_data(r_obj) with pytest.raises(ValueError, match="(?i)not castable"): unparse_data(r_data, file_format="xdr") From 1868d8acffb2dab8a7cbe1d0efed4baecefef4a6 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Tue, 10 Sep 2024 09:12:01 +0300 Subject: [PATCH 007/100] Fix masked values in masked array --- rdata/conversion/to_r.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 918b89b..3f68fac 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -10,6 +10,7 @@ import pandas as pd from rdata.parser import ( + R_INT_NA, CharFlags, RData, RExtraInfo, @@ -421,8 +422,9 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 and index.step == 1 ): row_names = np.ma.array( # type: ignore [no-untyped-call] - data=[0, -data.shape[0]], + data=[R_INT_NA, -data.shape[0]], mask=[True, False], + fill_value=R_INT_NA, ) else: msg = f"pd.DataFrame index {type(index)} not implemented" From 8d9cb55b96d9e60b2bab100741c24c15f8357405 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Tue, 10 Sep 2024 09:12:40 +0300 Subject: [PATCH 008/100] Compare first string representations --- rdata/tests/test_write.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdata/tests/test_write.py b/rdata/tests/test_write.py index 57c358a..a46d8db 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -142,8 +142,8 @@ def test_convert_to_r(fname: str) -> None: except NotImplementedError as e: pytest.xfail(str(e)) - assert r_data == new_r_data assert str(r_data) == str(new_r_data) + assert r_data == new_r_data def test_convert_to_r_bad_rda() -> None: From 398d1e9baf6dfabc2117b0068f7c71d89c747950 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Tue, 10 Sep 2024 09:39:32 +0300 Subject: [PATCH 009/100] Fix conversion of dataframe columns --- rdata/conversion/to_r.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 3f68fac..6225d1b 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -387,17 +387,8 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 r_value = data elif isinstance(data, pd.Series): - array = data.array - if isinstance(array, pd.Categorical): - return self.convert_to_r_object(array) - elif isinstance(array, pd.arrays.StringArray): - return self.convert_to_r_object(create_unicode_array(array)) - elif (isinstance(array, pd.arrays.IntegerArray) - or isinstance(array, pd.arrays.NumpyExtensionArray)): - return self.convert_to_r_object(data.to_numpy()) - else: - msg = f"pd.Series {type(array)} not implemented" - raise NotImplementedError(msg) + msg = f"pd.Series not implemented" + raise NotImplementedError(msg) elif isinstance(data, pd.Categorical): r_type = RObjectType.INT @@ -413,7 +404,20 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 r_value = [] for column, series in data.items(): names.append(column) - r_value.append(self.convert_to_r_object(series)) + + array = series.array + if isinstance(array, pd.Categorical): + r_series = self.convert_to_r_object(array) + elif isinstance(array, pd.arrays.StringArray): + r_series = self.convert_to_r_object(create_unicode_array(array)) + elif (isinstance(array, pd.arrays.IntegerArray) + or isinstance(array, pd.arrays.NumpyExtensionArray)): + r_series = self.convert_to_r_object(array.to_numpy()) + else: + msg = f"pd.DataFrame with pd.Series {type(array)} not implemented" + raise NotImplementedError(msg) + + r_value.append(r_series) index = data.index if (isinstance(index, pd.RangeIndex) From 9cdd37c27d6d4c93e5db30d5b24cef194586192d Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Tue, 10 Sep 2024 14:03:42 +0300 Subject: [PATCH 010/100] Add support for dataframe with string index --- rdata/conversion/to_r.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 6225d1b..5a959f0 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -420,6 +420,7 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 r_value.append(r_series) index = data.index + attr_order = ["names", "row.names", "class"] if (isinstance(index, pd.RangeIndex) and index.start == 1 and index.stop == data.shape[0] + 1 @@ -430,15 +431,24 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 mask=[True, False], fill_value=R_INT_NA, ) + elif isinstance(index, pd.Index): + attr_order = ["names", "class", "row.names"] + if index.dtype == 'object': + row_names = create_unicode_array(index) + else: + msg = f"pd.DataFrame pd.Index {index.dtype} not implemented" + raise NotImplementedError(msg) else: msg = f"pd.DataFrame index {type(index)} not implemented" raise NotImplementedError(msg) - attributes = self.build_r_list({ + attr_dict = { "names": create_unicode_array(names), "row.names": row_names, "class": "data.frame", - }) + } + + attributes = self.build_r_list({k: attr_dict[k] for k in attr_order}) else: msg = f"type {type(data)} not implemented" From 5084d2d2bdbaef131f8abbef997f4579e436c8ee Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 07:44:32 +0300 Subject: [PATCH 011/100] Add assertions for strings --- rdata/conversion/to_r.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 5a959f0..6e12f6d 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -65,7 +65,11 @@ def create_unicode_array( Returns: Array. """ - return np.array(list(names), dtype=np.dtype("U")) + name_list = [] + for name in names: + assert isinstance(name, str) + name_list.append(name) + return np.array(name_list, dtype=np.dtype("U")) def find_is_object(attributes: RObject | None): @@ -403,6 +407,7 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 names = [] r_value = [] for column, series in data.items(): + assert isinstance(column, str) names.append(column) array = series.array From af0f6fe31837d58d12be407e4fb7316a19a5bbab Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 07:46:28 +0300 Subject: [PATCH 012/100] Add conversion for rangeindex and range --- rdata/conversion/to_r.py | 54 +++++++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 6e12f6d..fc40503 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -213,20 +213,23 @@ def build_r_list(self, data = data.copy() key = next(iter(data)) tag = self.build_r_sym(key) - value1 = convert_value(data.pop(key)) + car = data.pop(key) elif isinstance(data, list): - value1 = convert_value(data[0]) + car = data[0] data = data[1:] tag = None + if not isinstance(car, RObject): + car = convert_value(car) + if len(data) == 0: - value2 = build_r_object(RObjectType.NILVALUE) + cdr = build_r_object(RObjectType.NILVALUE) else: - value2 = self.build_r_list(data, convert_value=convert_value) + cdr = self.build_r_list(data, convert_value=convert_value) return build_r_object( RObjectType.LIST, - value=(value1, value2), + value=(car, cdr), tag=tag, ) @@ -390,6 +393,27 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 raise ValueError(msg) r_value = data + elif isinstance(data, range): + if data.step != 1: + # R supports compact sequences only with step 1; + # convert the range to an array of values + return self.convert_to_r_object(np.array(data)) + + r_type = RObjectType.ALTREP + r_value = ( + self.build_r_list([ + self.build_r_sym("compact_intseq"), + self.build_r_sym("base"), + RObjectType.INT.value, + ]), + self.convert_to_r_object(np.array([ + len(data), + data.start, + data.step, + ], dtype=float)), + self.convert_to_r_object(None), + ) + elif isinstance(data, pd.Series): msg = f"pd.Series not implemented" raise NotImplementedError(msg) @@ -426,16 +450,18 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 index = data.index attr_order = ["names", "row.names", "class"] - if (isinstance(index, pd.RangeIndex) - and index.start == 1 - and index.stop == data.shape[0] + 1 - and index.step == 1 + if isinstance(index, pd.RangeIndex): + if (index.start == 1 + and index.stop == data.shape[0] + 1 + and index.step == 1 ): - row_names = np.ma.array( # type: ignore [no-untyped-call] - data=[R_INT_NA, -data.shape[0]], - mask=[True, False], - fill_value=R_INT_NA, - ) + row_names = np.ma.array( # type: ignore [no-untyped-call] + data=[R_INT_NA, -data.shape[0]], + mask=[True, False], + fill_value=R_INT_NA, + ) + else: + row_names = range(index.start, index.stop, index.step) elif isinstance(index, pd.Index): attr_order = ["names", "class", "row.names"] if index.dtype == 'object': From 1c71a866a73cafe3c7ee57416245eeb2adf4bd13 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 07:47:00 +0300 Subject: [PATCH 013/100] Add conversion of integer index --- rdata/conversion/to_r.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index fc40503..4743e9f 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -466,6 +466,8 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 attr_order = ["names", "class", "row.names"] if index.dtype == 'object': row_names = create_unicode_array(index) + elif np.issubdtype(index.dtype, np.integer): + row_names = index.to_numpy() else: msg = f"pd.DataFrame pd.Index {index.dtype} not implemented" raise NotImplementedError(msg) From 8fa951e71db2ddb8ddd740a1d8b5b7551c30cf6f Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 07:47:24 +0300 Subject: [PATCH 014/100] Add unparsing altreps --- rdata/unparser/_unparser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rdata/unparser/_unparser.py b/rdata/unparser/_unparser.py index 1524317..7dd6243 100644 --- a/rdata/unparser/_unparser.py +++ b/rdata/unparser/_unparser.py @@ -111,6 +111,7 @@ def unparse_r_object(self, obj: RObject) -> None: # noqa: C901, PLR0912 elif info.type in { RObjectType.LIST, RObjectType.LANG, + RObjectType.ALTREP, # Parser treats the following equal to LIST. # Not tested if they work # RObjectType.CLO, From b205d8d7da730a876bfb0a30f90d76a866b69189 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 08:08:09 +0300 Subject: [PATCH 015/100] Move build_r_data function under converter class --- rdata/_write.py | 26 ++++++------- rdata/conversion/__init__.py | 1 - rdata/conversion/to_r.py | 72 ++++++++++++++++++------------------ rdata/tests/test_write.py | 19 +++++----- 4 files changed, 57 insertions(+), 61 deletions(-) diff --git a/rdata/_write.py b/rdata/_write.py index 0630d1e..a1fd162 100644 --- a/rdata/_write.py +++ b/rdata/_write.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING -from .conversion import build_r_data, ConverterFromPythonToR +from .conversion import ConverterFromPythonToR from .conversion.to_r import DEFAULT_FORMAT_VERSION from .unparser import unparse_file @@ -27,10 +27,7 @@ def write_rds( """ Write an RDS file. - This is a convenience function that wraps - :func:`rdata.conversion.convert_to_r_object`, - :func:`rdata.conversion.build_r_data`, - and :func:`rdata.unparser.unparse_file`, + This is a convenience function that wraps conversion and unparsing as it is the common use case. Args: @@ -52,12 +49,13 @@ def write_rds( >>> data = ["hello", 1, 2.2, 3.3+4.4j] >>> rdata.write_rds("test.rds", data) """ - r_object = ConverterFromPythonToR(encoding=encoding).convert_to_r_object(data) - r_data = build_r_data( - r_object, + converter = ConverterFromPythonToR( encoding=encoding, format_version=format_version, ) + r_object = converter.convert_to_r_object(data) + r_data = converter.build_r_data(r_object) + unparse_file( path, r_data, @@ -79,10 +77,7 @@ def write_rda( """ Write an RDA or RDATA file. - This is a convenience function that wraps - :func:`rdata.conversion.convert_to_r_object_for_rda`, - :func:`rdata.conversion.build_r_data`, - and :func:`rdata.unparser.unparse_file`, + This is a convenience function that wraps conversion and unparsing as it is the common use case. Args: @@ -104,12 +99,13 @@ def write_rda( >>> data = {"name": "hello", "values": [1, 2.2, 3.3+4.4j]} >>> rdata.write_rda("test.rda", data) """ - r_object = ConverterFromPythonToR(encoding=encoding).convert_to_r_object_for_rda(data) - r_data = build_r_data( - r_object, + converter = ConverterFromPythonToR( encoding=encoding, format_version=format_version, ) + r_object = converter.convert_to_r_object_for_rda(data) + r_data = converter.build_r_data(r_object) + unparse_file( path, r_data, diff --git a/rdata/conversion/__init__.py b/rdata/conversion/__init__.py index 2ec4f44..e802758 100644 --- a/rdata/conversion/__init__.py +++ b/rdata/conversion/__init__.py @@ -25,6 +25,5 @@ ts_constructor as ts_constructor, ) from .to_r import ( - build_r_data as build_r_data, ConverterFromPythonToR as ConverterFromPythonToR, ) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 4743e9f..c8ec7e0 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -51,6 +51,7 @@ def __call__(self, data: Any, *, encoding: Encoding) -> RObject: # noqa: ANN401 2: 0x20300, 3: 0x30500, }) +R_MINIMUM_VERSION_WITH_ENCODING: Final[int] = 3 def create_unicode_array( @@ -136,54 +137,55 @@ def build_r_object( ) -def build_r_data( - r_object: RObject, - *, - encoding: Encoding = "utf-8", - format_version: int = DEFAULT_FORMAT_VERSION, - r_version_serialized: int = DEFAULT_R_VERSION_SERIALIZED, -) -> RData: + +class ConverterFromPythonToR: """ - Build RData object from R object. + Class converting Python objects to R objects. Args: - r_object: R object. - encoding: Encoding saved in the metadata. + encoding: Encoding to be used for strings within data. format_version: File format version. r_version_serialized: R version written as the creator of the object. + """ - Returns: - Corresponding RData object. + def __init__(self, *, + encoding: Encoding = "utf-8", + format_version: int = DEFAULT_FORMAT_VERSION, + r_version_serialized: int = DEFAULT_R_VERSION_SERIALIZED, + ) -> None: + self.encoding = encoding + self.format_version = format_version + self.r_version_serialized = r_version_serialized + self.reference_name_list = [None] + self.reference_obj_list = [None] - See Also: - convert_to_r_object - """ - versions = RVersions( - format_version, - r_version_serialized, - R_MINIMUM_VERSIONS[format_version], - ) - minimum_version_with_encoding = 3 - extra = (RExtraInfo(encoding.upper()) - if versions.format >= minimum_version_with_encoding - else RExtraInfo(None)) + def build_r_data(self, + r_object: RObject, + ) -> RData: + """ + Build RData object from R object. - return RData(versions, extra, r_object) + Args: + r_object: R object. + Returns: + Corresponding RData object. -class ConverterFromPythonToR: - """ - Class converting Python objects to R objects. + See Also: + convert_to_r_object + """ + versions = RVersions( + self.format_version, + self.r_version_serialized, + R_MINIMUM_VERSIONS[self.format_version], + ) - Args: - encoding: Encoding to be used for strings within data. - """ + extra = (RExtraInfo(self.encoding.upper()) + if versions.format >= R_MINIMUM_VERSION_WITH_ENCODING + else RExtraInfo(None)) - def __init__(self, *, encoding: Encoding = "utf-8"): - self.encoding = encoding - self.reference_name_list = [None] - self.reference_obj_list = [None] + return RData(versions, extra, r_object) def build_r_list(self, diff --git a/rdata/tests/test_write.py b/rdata/tests/test_write.py index a46d8db..0f814c1 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -10,7 +10,7 @@ import pytest import rdata -from rdata.conversion import ConverterFromPythonToR, build_r_data +from rdata.conversion import ConverterFromPythonToR from rdata.unparser import unparse_data if TYPE_CHECKING: @@ -128,17 +128,16 @@ def test_convert_to_r(fname: str) -> None: encoding = encoding.lower() # type: ignore [assignment] try: - converter = ConverterFromPythonToR(encoding=encoding) - if file_type == "rds": - r_obj = converter.convert_to_r_object(py_data) - else: - r_obj = converter.convert_to_r_object_for_rda(py_data) - new_r_data = build_r_data( - r_obj, + converter = ConverterFromPythonToR( encoding=encoding, format_version=r_data.versions.format, r_version_serialized=r_data.versions.serialized, ) + if file_type == "rds": + r_obj = converter.convert_to_r_object(py_data) + else: + r_obj = converter.convert_to_r_object_for_rda(py_data) + new_r_data = converter.build_r_data(r_obj) except NotImplementedError as e: pytest.xfail(str(e)) @@ -167,7 +166,7 @@ def test_unparse_bad_rda() -> None: py_data = "hello" converter = ConverterFromPythonToR() r_obj = converter.convert_to_r_object(py_data) - r_data = build_r_data(r_obj) + r_data = converter.build_r_data(r_obj) with pytest.raises(ValueError, match="(?i)must be dictionary-like"): unparse_data(r_data, file_type="rda") @@ -191,7 +190,7 @@ def test_unparse_big_int() -> None: big_int = 2**32 converter = ConverterFromPythonToR() r_obj = converter.convert_to_r_object(big_int) - r_data = build_r_data(r_obj) + r_data = converter.build_r_data(r_obj) with pytest.raises(ValueError, match="(?i)not castable"): unparse_data(r_data, file_format="xdr") From 963a9bc7d2dd66af17ef7065a9220a9a38431de5 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 08:08:38 +0300 Subject: [PATCH 016/100] Convert range to array for old format --- rdata/conversion/to_r.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index c8ec7e0..b85b6d2 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -396,6 +396,11 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 r_value = data elif isinstance(data, range): + if self.format_version < 3: + # ALTREP support is from R version 3.5.0 + # (minimum version for format version 3) + return self.convert_to_r_object(np.array(data)) + if data.step != 1: # R supports compact sequences only with step 1; # convert the range to an array of values From 61a2ea22774504a7aaf6f88c3abb9b5ce13143bc Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 08:49:46 +0300 Subject: [PATCH 017/100] Fix ruff --- rdata/conversion/to_r.py | 46 ++++++++++++++++++++++++++------------- rdata/tests/test_write.py | 8 +++---- 2 files changed, 35 insertions(+), 19 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index b85b6d2..a3041de 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -26,11 +26,11 @@ ) if TYPE_CHECKING: - import numpy.typing as npt - from collections.abc import Mapping from typing import Any, Final, Literal, Protocol + import numpy.typing as npt + Encoding = Literal["utf-8", "cp1252"] @@ -52,10 +52,11 @@ def __call__(self, data: Any, *, encoding: Encoding) -> RObject: # noqa: ANN401 3: 0x30500, }) R_MINIMUM_VERSION_WITH_ENCODING: Final[int] = 3 +R_MINIMUM_VERSION_WITH_ALTREP: Final[int] = 3 def create_unicode_array( - names: Any, + names: Any, # noqa: ANN401 ) -> npt.NDArray[Any]: """ Create unicode array from sequence/iterator of strings. @@ -73,7 +74,7 @@ def create_unicode_array( return np.array(name_list, dtype=np.dtype("U")) -def find_is_object(attributes: RObject | None): +def find_is_object(attributes: RObject | None) -> bool: if attributes is None: return False info = attributes.info @@ -119,7 +120,10 @@ def build_r_object( """ assert r_type is not None reference_id, referenced_object = reference - assert (reference_id == 0) == (referenced_object == None) == (r_type != RObjectType.REF) + assert ((reference_id == 0) + == (referenced_object is None) + == (r_type != RObjectType.REF) + ) is_object = find_is_object(attributes) return RObject( RObjectInfo( @@ -142,17 +146,24 @@ class ConverterFromPythonToR: """ Class converting Python objects to R objects. - Args: + Attributes: encoding: Encoding to be used for strings within data. format_version: File format version. r_version_serialized: R version written as the creator of the object. """ - def __init__(self, *, encoding: Encoding = "utf-8", format_version: int = DEFAULT_FORMAT_VERSION, r_version_serialized: int = DEFAULT_R_VERSION_SERIALIZED, ) -> None: + """ + Init class. + + Args: + encoding: Encoding to be used for strings within data. + format_version: File format version. + r_version_serialized: R version written as the creator of the object. + """ self.encoding = encoding self.format_version = format_version self.r_version_serialized = r_version_serialized @@ -321,7 +332,7 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 if len(data.attributes) > 0: # The following might work here (untested) - # attributes = build_r_list(data.attributes) # noqa: ERA001,E501 + # attributes = build_r_list(data.attributes) # noqa: ERA001 msg = f"type {r_type} with attributes not implemented" raise NotImplementedError(msg) @@ -350,7 +361,8 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 elif data.dtype.kind in ["U"]: assert data.ndim == 1 - data = np.array([s.encode(self.encoding) for s in data], dtype=np.dtype("S")) + data = np.array([s.encode(self.encoding) for s in data], + dtype=np.dtype("S")) return self.convert_to_r_object(data) else: @@ -396,7 +408,7 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 r_value = data elif isinstance(data, range): - if self.format_version < 3: + if self.format_version < R_MINIMUM_VERSION_WITH_ALTREP: # ALTREP support is from R version 3.5.0 # (minimum version for format version 3) return self.convert_to_r_object(np.array(data)) @@ -422,7 +434,7 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 ) elif isinstance(data, pd.Series): - msg = f"pd.Series not implemented" + msg = "pd.Series not implemented" raise NotImplementedError(msg) elif isinstance(data, pd.Categorical): @@ -446,8 +458,10 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 r_series = self.convert_to_r_object(array) elif isinstance(array, pd.arrays.StringArray): r_series = self.convert_to_r_object(create_unicode_array(array)) - elif (isinstance(array, pd.arrays.IntegerArray) - or isinstance(array, pd.arrays.NumpyExtensionArray)): + elif isinstance(array, ( + pd.arrays.IntegerArray, + pd.arrays.NumpyExtensionArray, + )): r_series = self.convert_to_r_object(array.to_numpy()) else: msg = f"pd.DataFrame with pd.Series {type(array)} not implemented" @@ -471,7 +485,7 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 row_names = range(index.start, index.stop, index.step) elif isinstance(index, pd.Index): attr_order = ["names", "class", "row.names"] - if index.dtype == 'object': + if index.dtype == "object": row_names = create_unicode_array(index) elif np.issubdtype(index.dtype, np.integer): row_names = index.to_numpy() @@ -494,4 +508,6 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 msg = f"type {type(data)} not implemented" raise NotImplementedError(msg) - return build_r_object(r_type, value=r_value, attributes=attributes, tag=tag, gp=gp) + return build_r_object(r_type, value=r_value, + attributes=attributes, + tag=tag, gp=gp) diff --git a/rdata/tests/test_write.py b/rdata/tests/test_write.py index 0f814c1..07ac786 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -148,16 +148,16 @@ def test_convert_to_r(fname: str) -> None: def test_convert_to_r_bad_rda() -> None: """Test checking that data for RDA has variable names.""" py_data = "hello" + converter = ConverterFromPythonToR() with pytest.raises(TypeError, match="(?i)data must be a dictionary"): - converter = ConverterFromPythonToR() converter.convert_to_r_object_for_rda(py_data) # type: ignore [arg-type] def test_convert_to_r_empty_rda() -> None: """Test checking that data for RDA has variable names.""" py_data: dict[str, Any] = {} + converter = ConverterFromPythonToR() with pytest.raises(ValueError, match="(?i)data must not be empty"): - converter = ConverterFromPythonToR() converter.convert_to_r_object_for_rda(py_data) @@ -173,15 +173,15 @@ def test_unparse_bad_rda() -> None: def test_convert_to_r_bad_encoding() -> None: """Test checking encoding.""" + converter = ConverterFromPythonToR(encoding="non-existent") with pytest.raises(LookupError, match="(?i)unknown encoding"): - converter = ConverterFromPythonToR(encoding="non-existent") converter.convert_to_r_object("ä") # type: ignore [arg-type] def test_convert_to_r_unsupported_encoding() -> None: """Test checking encoding.""" + converter = ConverterFromPythonToR(encoding="cp1250") with pytest.raises(ValueError, match="(?i)unsupported encoding"): - converter = ConverterFromPythonToR(encoding="cp1250") converter.convert_to_r_object("ä") # type: ignore [arg-type] From 937908bef933a12b1decb21a87fd1b4a58885ee6 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 08:52:57 +0300 Subject: [PATCH 018/100] Set object flag explicitly --- rdata/conversion/to_r.py | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index a3041de..c6471c0 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -74,27 +74,11 @@ def create_unicode_array( return np.array(name_list, dtype=np.dtype("U")) -def find_is_object(attributes: RObject | None) -> bool: - if attributes is None: - return False - info = attributes.info - if info.type != RObjectType.LIST: - return False - if not info.tag: - return False - tag = attributes.tag - if tag.info.type == RObjectType.REF: - tag = tag.referenced_object - if (tag.info.type == RObjectType.SYM - and tag.value.value == b"class"): - return True - return find_is_object(attributes.value[1]) - - def build_r_object( r_type: RObjectType, *, value: Any = None, # noqa: ANN401 + is_object: bool = False, attributes: RObject | None = None, tag: RObject | None = None, gp: int = 0, @@ -106,6 +90,7 @@ def build_r_object( Args: r_type: Type indentifier. value: Value for RObject. + is_object: True if RObject represents object. attributes: Same as in RObject. tag: Same as in RObject. gp: Same as in RObjectInfo. @@ -124,7 +109,6 @@ def build_r_object( == (referenced_object is None) == (r_type != RObjectType.REF) ) - is_object = find_is_object(attributes) return RObject( RObjectInfo( r_type, @@ -312,9 +296,10 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 r_type = None values: list[Any] | tuple[Any, ...] r_value: Any = None - gp = 0 + is_object = False attributes = None tag = None + gp = 0 if data is None: r_type = RObjectType.NILVALUE @@ -438,6 +423,7 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 raise NotImplementedError(msg) elif isinstance(data, pd.Categorical): + is_object = True r_type = RObjectType.INT r_value = data.codes + 1 attributes = self.build_r_list({ @@ -446,6 +432,7 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 }) elif isinstance(data, pd.DataFrame): + is_object = True r_type = RObjectType.VEC names = [] r_value = [] @@ -509,5 +496,6 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 raise NotImplementedError(msg) return build_r_object(r_type, value=r_value, + is_object=is_object, attributes=attributes, tag=tag, gp=gp) From 8eda45413338efa212f9b62bd27c8a22b02cc1e7 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 09:20:16 +0300 Subject: [PATCH 019/100] Fix mypy --- rdata/conversion/to_r.py | 23 ++++++++++++----------- rdata/tests/test_write.py | 8 ++++---- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index c6471c0..415f94b 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -37,7 +37,7 @@ class Converter(Protocol): """Protocol for Py-to-R conversion.""" - def __call__(self, data: Any, *, encoding: Encoding) -> RObject: # noqa: ANN401 + def __call__(self, data: Any) -> RObject: # noqa: ANN401 """Convert Python object to R object.""" @@ -82,7 +82,7 @@ def build_r_object( attributes: RObject | None = None, tag: RObject | None = None, gp: int = 0, - reference: tuple(int, RObject | None) = (0, None), + reference: tuple[int, RObject | None] = (0, None), ) -> RObject: """ Build R object. @@ -151,8 +151,8 @@ def __init__(self, *, self.encoding = encoding self.format_version = format_version self.r_version_serialized = r_version_serialized - self.reference_name_list = [None] - self.reference_obj_list = [None] + self.reference_name_list: list[None | str] = [None] + self.reference_obj_list: list[None | RObject] = [None] def build_r_data(self, @@ -313,7 +313,7 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 values = data.elements r_value = (self.build_r_sym(str(values[0])), self.build_r_list(values[1:], - convert_value=self.build_r_sym)) + convert_value=self.build_r_sym)) # type: ignore [arg-type] if len(data.attributes) > 0: # The following might work here (untested) @@ -434,11 +434,11 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 elif isinstance(data, pd.DataFrame): is_object = True r_type = RObjectType.VEC - names = [] + column_names = [] r_value = [] for column, series in data.items(): assert isinstance(column, str) - names.append(column) + column_names.append(column) array = series.array if isinstance(array, pd.Categorical): @@ -447,7 +447,7 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 r_series = self.convert_to_r_object(create_unicode_array(array)) elif isinstance(array, ( pd.arrays.IntegerArray, - pd.arrays.NumpyExtensionArray, + pd.arrays.NumpyExtensionArray, # type: ignore [attr-defined] )): r_series = self.convert_to_r_object(array.to_numpy()) else: @@ -459,11 +459,12 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 index = data.index attr_order = ["names", "row.names", "class"] if isinstance(index, pd.RangeIndex): + assert isinstance(index.start, int) if (index.start == 1 and index.stop == data.shape[0] + 1 and index.step == 1 ): - row_names = np.ma.array( # type: ignore [no-untyped-call] + row_names = np.ma.array( data=[R_INT_NA, -data.shape[0]], mask=[True, False], fill_value=R_INT_NA, @@ -474,7 +475,7 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 attr_order = ["names", "class", "row.names"] if index.dtype == "object": row_names = create_unicode_array(index) - elif np.issubdtype(index.dtype, np.integer): + elif np.issubdtype(str(index.dtype), np.integer): row_names = index.to_numpy() else: msg = f"pd.DataFrame pd.Index {index.dtype} not implemented" @@ -484,7 +485,7 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 raise NotImplementedError(msg) attr_dict = { - "names": create_unicode_array(names), + "names": create_unicode_array(column_names), "row.names": row_names, "class": "data.frame", } diff --git a/rdata/tests/test_write.py b/rdata/tests/test_write.py index 07ac786..86e9762 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -173,16 +173,16 @@ def test_unparse_bad_rda() -> None: def test_convert_to_r_bad_encoding() -> None: """Test checking encoding.""" - converter = ConverterFromPythonToR(encoding="non-existent") + converter = ConverterFromPythonToR(encoding="non-existent") # type: ignore [arg-type] with pytest.raises(LookupError, match="(?i)unknown encoding"): - converter.convert_to_r_object("ä") # type: ignore [arg-type] + converter.convert_to_r_object("ä") def test_convert_to_r_unsupported_encoding() -> None: """Test checking encoding.""" - converter = ConverterFromPythonToR(encoding="cp1250") + converter = ConverterFromPythonToR(encoding="cp1250") # type: ignore [arg-type] with pytest.raises(ValueError, match="(?i)unsupported encoding"): - converter.convert_to_r_object("ä") # type: ignore [arg-type] + converter.convert_to_r_object("ä") def test_unparse_big_int() -> None: From efbb09d2af88fa2bbb0d02cae4923cbdaa384bcf Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 09:48:17 +0300 Subject: [PATCH 020/100] Add tests for different dataframe index types --- .../data/test_dataframe_int_rownames.rds | Bin 0 -> 123 bytes .../data/test_dataframe_range_rownames.rds | Bin 0 -> 163 bytes rdata/tests/test_rdata.py | 30 ++++++++++++++++++ 3 files changed, 30 insertions(+) create mode 100644 rdata/tests/data/test_dataframe_int_rownames.rds create mode 100644 rdata/tests/data/test_dataframe_range_rownames.rds diff --git a/rdata/tests/data/test_dataframe_int_rownames.rds b/rdata/tests/data/test_dataframe_int_rownames.rds new file mode 100644 index 0000000000000000000000000000000000000000..74772a2ea72e7b5f2c3d9973ed8f805cd58752fd GIT binary patch literal 123 zcmb2|=3oE==I#ec2?+^l35kr8);Op!XJ>TGUdK8?o#%-GlcBsvV>1`CgvAVpOlvlc zghZy*$%zkGGS7LR@zrHBVwu1aIq7pT&qYzS=PF{0wV!x->Udu9)@GX3xNOd5jr}qF ZX=eI2jtZP%o5b>{o8dU|?WoU||80tUx9MYiNj@t_6@M4CF8ZF&{{Qg9-x} zIG8|bI|e9VVFIgTVc-O5&P&WqEe0|KkOWzh^K%T*6(;8-7NaZVN=Yn9)JrP@nt`UA zvnan@4`%i|AV!wtOU} None: ), ) + def test_dataframe_int_rownames(self) -> None: + """Test dataframe conversion.""" + # File created in R with + # df = data.frame(col1=c(10, 20, 30), row.names=c(3L, 6L, 9L)); saveRDS(df, file="test_dataframe_int_rownames.rds") # noqa: E501 + data = rdata.read_rda(TESTDATA_PATH / "test_dataframe_int_rownames.rds") + + index = np.array([3, 6, 9], dtype=np.int32) + ref = pd.DataFrame( + { + "col1": pd.Series([10., 20., 30.], dtype=pd.Float64Dtype(), index=index), + }, + index=index, + ) + pd.testing.assert_frame_equal(data, ref) + + def test_dataframe_range_rownames(self) -> None: + """Test dataframe conversion.""" + # File created in R with + # df = data.frame(col1=c(10, 20, 30), row.names=2:4); saveRDS(df, file="test_dataframe_range_rownames.rds") # noqa: E501 + data = rdata.read_rda(TESTDATA_PATH / "test_dataframe_range_rownames.rds") + + index = pd.RangeIndex(2, 5) + ref = pd.DataFrame( + { + "col1": pd.Series([10., 20., 30.], dtype=pd.Float64Dtype(), index=index), + }, + index=index, + ) + pd.testing.assert_frame_equal(data, ref) + def test_ts(self) -> None: """Test time series conversion.""" data = rdata.read_rda(TESTDATA_PATH / "test_ts.rda") From 32a2cc6175c1a6985f427fc782bd3e238119aba5 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 10:13:40 +0300 Subject: [PATCH 021/100] Test converting expanded altrep --- rdata/tests/test_write.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdata/tests/test_write.py b/rdata/tests/test_write.py index 86e9762..a5a2127 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -113,7 +113,7 @@ def test_convert_to_r(fname: str) -> None: data = decompress_data(f.read()) file_type, file_format = parse_file_type_and_format(data) - r_data = rdata.parser.parse_data(data, expand_altrep=False) + r_data = rdata.parser.parse_data(data) try: py_data = rdata.conversion.convert(r_data) From 1f4e8d824c3d0627366887d4d46db7f627b80d6e Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 10:14:28 +0300 Subject: [PATCH 022/100] Add only non-nil attributes to expanded altrep --- rdata/parser/_parser.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/rdata/parser/_parser.py b/rdata/parser/_parser.py index 902484b..6b9808f 100644 --- a/rdata/parser/_parser.py +++ b/rdata/parser/_parser.py @@ -941,7 +941,10 @@ def parse_R_object( # noqa: N802, C901, PLR0912, PLR0915 info=altrep_info, state=altrep_state, ) - attributes = altrep_attr + if altrep_attr.info.type != RObjectType.NILVALUE: + info.attributes = True + attributes_read = True + attributes = altrep_attr else: value = (altrep_info, altrep_state, altrep_attr) From 237bc22cc6bf10fcb81268ddd178d7a5f3b094b6 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 11:04:05 +0300 Subject: [PATCH 023/100] Enable general rangeindex in dataframe --- rdata/conversion/_conversion.py | 51 ++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/rdata/conversion/_conversion.py b/rdata/conversion/_conversion.py index 7ad0957..fa846bf 100644 --- a/rdata/conversion/_conversion.py +++ b/rdata/conversion/_conversion.py @@ -394,6 +394,52 @@ def convert_array( return value # type: ignore [no-any-return] +def convert_altrep_to_range( + r_altrep: parser.RObject, +) -> range: + """ + Convert a R altrep to range object. + + Args: + r_altrep: R altrep object + + Returns: + Array. + + See Also: + convert_array + """ + if r_altrep.info.type != parser.RObjectType.ALTREP: + msg = "Must receive an altrep object" + raise TypeError(msg) + + info, state, attr = r_altrep.value + assert attr.info.type == parser.RObjectType.NILVALUE + + assert info.info.type == parser.RObjectType.LIST + + class_sym = info.value[0] + while class_sym.info.type == parser.RObjectType.REF: + class_sym = class_sym.referenced_object + + assert class_sym.info.type == parser.RObjectType.SYM + assert class_sym.value.info.type == parser.RObjectType.CHAR + + altrep_name = class_sym.value.value + assert isinstance(altrep_name, bytes) + + if altrep_name != b"compact_intseq": + msg = "Only compact integer sequences can be converted to range" + raise NotImplementedError(msg) + + n = int(state.value[0]) + start = int(state.value[1]) + step = int(state.value[2]) + stop = start + (n - 1) * step + value = range(start, stop + 1, step) + return value + + R_INT_MIN = -2**31 @@ -430,7 +476,7 @@ def dataframe_constructor( and isinstance(row_names, np.ma.MaskedArray) and row_names.mask[0] ) - else tuple(row_names) + else row_names ) return pd.DataFrame(obj, columns=obj, index=index) @@ -820,6 +866,9 @@ def _convert_next( # noqa: C901, PLR0912, PLR0915 value = None + elif obj.info.type == parser.RObjectType.ALTREP: + value = convert_altrep_to_range(obj) + else: msg = f"Type {obj.info.type} not implemented" raise NotImplementedError(msg) From 6859b8cabeff5e051f868d64f00587b4ccb0d18b Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 11:05:42 +0300 Subject: [PATCH 024/100] Test conversion of altreps --- rdata/tests/test_write.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/rdata/tests/test_write.py b/rdata/tests/test_write.py index a5a2127..0fe7934 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -98,7 +98,8 @@ def test_unparse(fname: str) -> None: @pytest.mark.parametrize("fname", fnames, ids=fnames) -def test_convert_to_r(fname: str) -> None: +@pytest.mark.parametrize("expand_altrep", [True, False]) +def test_convert_to_r(fname: str, expand_altrep: bool) -> None: """Test converting Python data to RData object.""" with (TESTDATA_PATH / fname).open("rb") as f: # Skip test files without unique R->py->R transformation @@ -113,7 +114,7 @@ def test_convert_to_r(fname: str) -> None: data = decompress_data(f.read()) file_type, file_format = parse_file_type_and_format(data) - r_data = rdata.parser.parse_data(data) + r_data = rdata.parser.parse_data(data, expand_altrep=expand_altrep) try: py_data = rdata.conversion.convert(r_data) @@ -144,6 +145,18 @@ def test_convert_to_r(fname: str) -> None: assert str(r_data) == str(new_r_data) assert r_data == new_r_data + # Check futher that the resulting unparsed data is correct to ensure that + # Python-to-R conversion hasn't created any odd objects that can't be unparsed + if not expand_altrep: + file_type, file_format = parse_file_type_and_format(data) + out_data = unparse_data( + new_r_data, file_format=file_format, file_type=file_type) + + if file_format == "ascii": + data = data.replace(b"\r\n", b"\n") + + assert data == out_data + def test_convert_to_r_bad_rda() -> None: """Test checking that data for RDA has variable names.""" From 5ac49d0cae3d9c4dec8c2abdfed4ef88afab8bfd Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 11:06:18 +0300 Subject: [PATCH 025/100] Change attribute order to match test files --- rdata/conversion/to_r.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 415f94b..b09a913 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -470,6 +470,7 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 fill_value=R_INT_NA, ) else: + attr_order = ["names", "class", "row.names"] row_names = range(index.start, index.stop, index.step) elif isinstance(index, pd.Index): attr_order = ["names", "class", "row.names"] From 6ad1408ebe60e17586b8bb1397cd4e1adc5d27ef Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 11:18:19 +0300 Subject: [PATCH 026/100] Add comment about reordering attributes --- rdata/conversion/to_r.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index b09a913..01ed7aa 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -456,8 +456,12 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 r_value.append(r_series) - index = data.index + # In test files the order in which attributes are written varies. + # We replicate here the order matching test files, but likely + # R could read files with attributes in any order. attr_order = ["names", "row.names", "class"] + + index = data.index if isinstance(index, pd.RangeIndex): assert isinstance(index.start, int) if (index.start == 1 @@ -490,7 +494,6 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 "row.names": row_names, "class": "data.frame", } - attributes = self.build_r_list({k: attr_dict[k] for k in attr_order}) else: From 1c458ba617d0f3f895c0ccc4d984fecde9391b8a Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 11:25:08 +0300 Subject: [PATCH 027/100] Fix ruff and mypy --- rdata/conversion/_conversion.py | 3 +-- rdata/tests/test_rdata.py | 4 ++-- rdata/tests/test_write.py | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/rdata/conversion/_conversion.py b/rdata/conversion/_conversion.py index fa846bf..efbc41c 100644 --- a/rdata/conversion/_conversion.py +++ b/rdata/conversion/_conversion.py @@ -436,8 +436,7 @@ def convert_altrep_to_range( start = int(state.value[1]) step = int(state.value[2]) stop = start + (n - 1) * step - value = range(start, stop + 1, step) - return value + return range(start, stop + 1, step) R_INT_MIN = -2**31 diff --git a/rdata/tests/test_rdata.py b/rdata/tests/test_rdata.py index 898d6c9..47f07ff 100644 --- a/rdata/tests/test_rdata.py +++ b/rdata/tests/test_rdata.py @@ -512,7 +512,7 @@ def test_dataframe_int_rownames(self) -> None: """Test dataframe conversion.""" # File created in R with # df = data.frame(col1=c(10, 20, 30), row.names=c(3L, 6L, 9L)); saveRDS(df, file="test_dataframe_int_rownames.rds") # noqa: E501 - data = rdata.read_rda(TESTDATA_PATH / "test_dataframe_int_rownames.rds") + data = rdata.read_rds(TESTDATA_PATH / "test_dataframe_int_rownames.rds") index = np.array([3, 6, 9], dtype=np.int32) ref = pd.DataFrame( @@ -527,7 +527,7 @@ def test_dataframe_range_rownames(self) -> None: """Test dataframe conversion.""" # File created in R with # df = data.frame(col1=c(10, 20, 30), row.names=2:4); saveRDS(df, file="test_dataframe_range_rownames.rds") # noqa: E501 - data = rdata.read_rda(TESTDATA_PATH / "test_dataframe_range_rownames.rds") + data = rdata.read_rds(TESTDATA_PATH / "test_dataframe_range_rownames.rds") index = pd.RangeIndex(2, 5) ref = pd.DataFrame( diff --git a/rdata/tests/test_write.py b/rdata/tests/test_write.py index 0fe7934..aef384d 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -99,7 +99,7 @@ def test_unparse(fname: str) -> None: @pytest.mark.parametrize("fname", fnames, ids=fnames) @pytest.mark.parametrize("expand_altrep", [True, False]) -def test_convert_to_r(fname: str, expand_altrep: bool) -> None: +def test_convert_to_r(fname: str, expand_altrep: bool) -> None: # noqa: FBT001 """Test converting Python data to RData object.""" with (TESTDATA_PATH / fname).open("rb") as f: # Skip test files without unique R->py->R transformation From 92429caef84429a1ab3886f5bab92d0e0e1a02dd Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 11:46:24 +0300 Subject: [PATCH 028/100] Add test for dataframe with different dtypes --- rdata/tests/data/test_dataframe_dtypes.rds | Bin 0 -> 217 bytes rdata/tests/test_rdata.py | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+) create mode 100644 rdata/tests/data/test_dataframe_dtypes.rds diff --git a/rdata/tests/data/test_dataframe_dtypes.rds b/rdata/tests/data/test_dataframe_dtypes.rds new file mode 100644 index 0000000000000000000000000000000000000000..aeb9ffbb9d7556e11055fe05856abbaecf046e34 GIT binary patch literal 217 zcmV;~04Dz*iwFP!000001B>8dU|?WoU||80tUx9MYiNj@t_6@M4B`MWFIWKs14y9= z5X%8EA4tsp<4h2k<-iD~xzfM@C?^0hlz|gSGgiPDl`uvX)Hsl_j0_L}qWM8$4gwGX z2N5VO0i|W2v;ve?g6d>oVFK%i7|xoPn44M*bptC@kU2B21SX%Blb;CVuoagSW#*+r zc`Ql!`8iNFdvbnmK~8D~y6MR|iN!F}VS-#Ki6x18X+=Pb(Ufx*<(KQh90UqTrUoGK T|NsBLK None: ) pd.testing.assert_frame_equal(data, ref) + def test_dataframe_dtypes(self) -> None: + """Test dataframe conversion.""" + # File created in R with + # df = data.frame(int=c(10L, 20L, 30L), float=c(1.1, 2.2, 3.3), string=c("x", "y", "z"), bool=as.logical(c(1, 0, 1)), complex=c(4+5i, 6+7i, 8+9i)); print(df); saveRDS(df, file="test_dataframe_dtypes.rds") # noqa: E501 + data = rdata.read_rds(TESTDATA_PATH / "test_dataframe_dtypes.rds") + + index = pd.RangeIndex(1, 4) + ref = pd.DataFrame( + { + "int": pd.Series([10, 20, 30], dtype=pd.Int32Dtype(), index=index), + "float": pd.Series([1.1, 2.2, 3.3], dtype=pd.Float64Dtype(), index=index), + "string": pd.Series(["x" ,"y", "z"], dtype=pd.StringDtype(), index=index), + "bool": pd.Series([True, False, True], dtype=pd.BooleanDtype(), index=index), + "complex": pd.Series([4+5j, 6+7j, 8+9j], dtype=complex, index=index), + }, + index=index, + ) + pd.testing.assert_frame_equal(data, ref) + def test_ts(self) -> None: """Test time series conversion.""" data = rdata.read_rda(TESTDATA_PATH / "test_ts.rda") From 5cf678d055beb89a476c0ada7c95e54142607c17 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 11:48:16 +0300 Subject: [PATCH 029/100] Add conversion of boolean pd arrays --- rdata/conversion/to_r.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 01ed7aa..f583d3a 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -446,8 +446,9 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 elif isinstance(array, pd.arrays.StringArray): r_series = self.convert_to_r_object(create_unicode_array(array)) elif isinstance(array, ( - pd.arrays.IntegerArray, - pd.arrays.NumpyExtensionArray, # type: ignore [attr-defined] + pd.arrays.IntegerArray, + pd.arrays.BooleanArray, + pd.arrays.NumpyExtensionArray, # type: ignore [attr-defined] )): r_series = self.convert_to_r_object(array.to_numpy()) else: From f379fc97127a54caa9f1e289f0ce16e2dd52f819 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 14:25:08 +0300 Subject: [PATCH 030/100] Add test for pandas dtypes --- rdata/tests/test_write.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/rdata/tests/test_write.py b/rdata/tests/test_write.py index aef384d..9d24052 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -7,6 +7,8 @@ from pathlib import Path from typing import TYPE_CHECKING, Any +import numpy as np +import pandas as pd import pytest import rdata @@ -208,6 +210,37 @@ def test_unparse_big_int() -> None: unparse_data(r_data, file_format="xdr") +def test_convert_dataframe_pandas_dtypes() -> None: + """Test converting dataframe with pandas dtypes.""" + df1 = pd.DataFrame( + { + "int": np.array([10, 20, 30], dtype=np.int32), + "float": [1.1, 2.2, 3.3], + "string": ["x" ,"y", "z"], + "bool": [True, False, True], + "complex": [4+5j, 6+7j, 8+9j], + }, + index=range(3), + ) + + df2 = pd.DataFrame( + { + "int": pd.Series([10, 20, 30], dtype=pd.Int32Dtype()), + "float": pd.Series([1.1, 2.2, 3.3], dtype=pd.Float64Dtype()), + "string": pd.Series(["x" ,"y", "z"], dtype=pd.StringDtype()), + "bool": pd.Series([True, False, True], dtype=pd.BooleanDtype()), + "complex": pd.Series([4+5j, 6+7j, 8+9j], dtype=complex), + }, + index=pd.RangeIndex(3), + ) + + r_obj1 = ConverterFromPythonToR().convert_to_r_object(df1) + r_obj2 = ConverterFromPythonToR().convert_to_r_object(df2) + + assert str(r_obj1) == str(r_obj2) + assert r_obj1 == r_obj2 + + @pytest.mark.parametrize("compression", [*valid_compressions, "fail"]) @pytest.mark.parametrize("file_format", [*valid_formats, None, "fail"]) @pytest.mark.parametrize("file_type", ["rds", "rda"]) From ddabf65432b7ded9fc5bb068410832dd58207e1c Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 14:25:37 +0300 Subject: [PATCH 031/100] Add missing conversions --- rdata/conversion/to_r.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index f583d3a..e976897 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -446,11 +446,16 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 elif isinstance(array, pd.arrays.StringArray): r_series = self.convert_to_r_object(create_unicode_array(array)) elif isinstance(array, ( - pd.arrays.IntegerArray, pd.arrays.BooleanArray, + pd.arrays.IntegerArray, + pd.arrays.FloatingArray, pd.arrays.NumpyExtensionArray, # type: ignore [attr-defined] )): - r_series = self.convert_to_r_object(array.to_numpy()) + np_array = array.to_numpy() + if np_array.dtype.kind == "O": + r_series = self.convert_to_r_object(create_unicode_array(array)) + else: + r_series = self.convert_to_r_object(array.to_numpy()) else: msg = f"pd.DataFrame with pd.Series {type(array)} not implemented" raise NotImplementedError(msg) From 9dd2559b00fd7f3e0f9d39ec40b29af8713b7a0b Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 14:48:53 +0300 Subject: [PATCH 032/100] Set dataframe attribute order file-by-file --- rdata/conversion/to_r.py | 16 ++++++++-------- rdata/tests/test_write.py | 21 +++++++++++++++------ 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index e976897..efc1f48 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -154,6 +154,11 @@ def __init__(self, *, self.reference_name_list: list[None | str] = [None] self.reference_obj_list: list[None | RObject] = [None] + # In test files the order in which dataframe attributes are written varies. + # R can read files with attributes in any order, but this variable + # is used in tests to change the attribute order to match with the test file. + self.df_attr_order = None + def build_r_data(self, r_object: RObject, @@ -462,11 +467,6 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 r_value.append(r_series) - # In test files the order in which attributes are written varies. - # We replicate here the order matching test files, but likely - # R could read files with attributes in any order. - attr_order = ["names", "row.names", "class"] - index = data.index if isinstance(index, pd.RangeIndex): assert isinstance(index.start, int) @@ -480,10 +480,8 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 fill_value=R_INT_NA, ) else: - attr_order = ["names", "class", "row.names"] row_names = range(index.start, index.stop, index.step) elif isinstance(index, pd.Index): - attr_order = ["names", "class", "row.names"] if index.dtype == "object": row_names = create_unicode_array(index) elif np.issubdtype(str(index.dtype), np.integer): @@ -500,7 +498,9 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 "row.names": row_names, "class": "data.frame", } - attributes = self.build_r_list({k: attr_dict[k] for k in attr_order}) + if self.df_attr_order is not None: + attr_dict = {k: attr_dict[k] for k in self.df_attr_order} + attributes = self.build_r_list(attr_dict) else: msg = f"type {type(data)} not implemented" diff --git a/rdata/tests/test_write.py b/rdata/tests/test_write.py index 9d24052..fe67aef 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -130,20 +130,29 @@ def test_convert_to_r(fname: str, expand_altrep: bool) -> None: # noqa: FBT001 else: encoding = encoding.lower() # type: ignore [assignment] + converter = ConverterFromPythonToR( + encoding=encoding, + format_version=r_data.versions.format, + r_version_serialized=r_data.versions.serialized, + ) + if fname in [ + "test_dataframe_dtypes.rds", + "test_dataframe_int_rownames.rds", + "test_dataframe_range_rownames.rds", + "test_dataframe_rownames.rda", + ]: + converter.df_attr_order = ["names", "class", "row.names"] + try: - converter = ConverterFromPythonToR( - encoding=encoding, - format_version=r_data.versions.format, - r_version_serialized=r_data.versions.serialized, - ) if file_type == "rds": r_obj = converter.convert_to_r_object(py_data) else: r_obj = converter.convert_to_r_object_for_rda(py_data) - new_r_data = converter.build_r_data(r_obj) except NotImplementedError as e: pytest.xfail(str(e)) + new_r_data = converter.build_r_data(r_obj) + assert str(r_data) == str(new_r_data) assert r_data == new_r_data From 865227183237f2a2c715aee0d208479bf0ab8135 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 15:05:03 +0300 Subject: [PATCH 033/100] Add test for dataframe with NAs --- .../data/test_dataframe_dtypes_with_na.rds | Bin 0 -> 235 bytes rdata/tests/test_rdata.py | 19 ++++++++++++++++++ 2 files changed, 19 insertions(+) create mode 100644 rdata/tests/data/test_dataframe_dtypes_with_na.rds diff --git a/rdata/tests/data/test_dataframe_dtypes_with_na.rds b/rdata/tests/data/test_dataframe_dtypes_with_na.rds new file mode 100644 index 0000000000000000000000000000000000000000..17a170c0e2b53e2b70a11b4ba4c44dae95f48313 GIT binary patch literal 235 zcmV8dU|?WoU||80tUx9MYiNj@t_6@M4B`MWFAxJ|89)j} zfLN}90Sx#+;`SeBg1{^XMkvjd1_t#XKuXvb0SN)HeijB!AkA0-V^qQzRX~FCKM+8z z09nNdv4MdRY85|7+(7`saS(yh5>Q$ON-IEVB?yh=1Snu(0-Fu7m^CjkH? None: ) pd.testing.assert_frame_equal(data, ref) + def test_dataframe_dtypes_with_na(self) -> None: + """Test dataframe conversion.""" + # File created in R with + # df = data.frame(int=c(10L, 20L, 30L, NA), float=c(1.1, 2.2, 3.3, NA), string=c("x", "y", "z", NA), bool=as.logical(c(1, 0, 1, NA)), complex=c(4+5i, 6+7i, 8+9i, NA)); saveRDS(df, file="test_dataframe_dtypes_with_na.rds") + data = rdata.read_rds(TESTDATA_PATH / "test_dataframe_dtypes_with_na.rds") + + index = pd.RangeIndex(1, 5) + ref = pd.DataFrame( + { + "int": pd.Series([10, 20, 30, None], dtype=pd.Int32Dtype(), index=index), + "float": pd.Series([1.1, 2.2, 3.3, None], dtype=pd.Float64Dtype(), index=index), + "string": pd.Series(["x" ,"y", "z", None], dtype=pd.StringDtype(), index=index), + "bool": pd.Series([True, False, True, None], dtype=pd.BooleanDtype(), index=index), + "complex": pd.Series([4+5j, 6+7j, 8+9j, None], dtype=complex, index=index), + }, + index=index, + ) + pd.testing.assert_frame_equal(data, ref) + def test_ts(self) -> None: """Test time series conversion.""" data = rdata.read_rda(TESTDATA_PATH / "test_ts.rda") From 993e2ed0f07a84aaf4576af24a4b439593cf9f29 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 18:02:36 +0300 Subject: [PATCH 034/100] Add dataframe column transformation for more types --- rdata/conversion/_conversion.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/rdata/conversion/_conversion.py b/rdata/conversion/_conversion.py index efbc41c..24c5022 100644 --- a/rdata/conversion/_conversion.py +++ b/rdata/conversion/_conversion.py @@ -445,14 +445,26 @@ def convert_altrep_to_range( def _dataframe_column_transform(source: Any) -> Any: # noqa: ANN401 if isinstance(source, np.ndarray): + dtype: Any if np.issubdtype(source.dtype, np.integer): - return pd.Series(source, dtype=pd.Int32Dtype()).array - - if np.issubdtype(source.dtype, np.bool_): - return pd.Series(source, dtype=pd.BooleanDtype()).array + dtype = pd.Int32Dtype() + elif np.issubdtype(source.dtype, np.floating): + dtype = pd.Float64Dtype() + elif np.issubdtype(source.dtype, np.complexfloating): + # There seems to be no pandas type for complex array + return source + elif np.issubdtype(source.dtype, np.bool_): + dtype = pd.BooleanDtype() + elif np.issubdtype(source.dtype, np.str_): + dtype = pd.StringDtype() + elif np.issubdtype(source.dtype, np.object_): + for value in source: + assert isinstance(value, str) or value is None + dtype = pd.StringDtype() + else: + return source - if np.issubdtype(source.dtype, np.str_): - return pd.Series(source, dtype=pd.StringDtype()).array + return pd.Series(source, dtype=dtype).array return source From dc1950df66b485df1df2f4aaa9d2bf4affb86366 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 18:05:06 +0300 Subject: [PATCH 035/100] Fix NA values in dataframes --- rdata/conversion/to_r.py | 107 +++++++++++++++++++++++++-------------- rdata/parser/__init__.py | 1 + rdata/parser/_parser.py | 4 ++ 3 files changed, 74 insertions(+), 38 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index efc1f48..372c749 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -10,6 +10,7 @@ import pandas as pd from rdata.parser import ( + R_FLOAT_NA, R_INT_NA, CharFlags, RData, @@ -55,23 +56,61 @@ def __call__(self, data: Any) -> RObject: # noqa: ANN401 R_MINIMUM_VERSION_WITH_ALTREP: Final[int] = 3 -def create_unicode_array( - names: Any, # noqa: ANN401 +def convert_pd_array_to_np_array( + pd_array: Any, ) -> npt.NDArray[Any]: """ - Create unicode array from sequence/iterator of strings. + Convert pandas array object to numpy array. Args: - names: Strings. + pd_array: Pandas array. Returns: - Array. + Numpy array. """ - name_list = [] - for name in names: - assert isinstance(name, str) - name_list.append(name) - return np.array(name_list, dtype=np.dtype("U")) + if isinstance(pd_array, pd.arrays.StringArray): + return pd_array.to_numpy() + elif isinstance(pd_array, ( + pd.arrays.BooleanArray, + pd.arrays.IntegerArray, + pd.arrays.FloatingArray, + )): + if isinstance(pd_array, pd.arrays.BooleanArray): + dtype = np.bool_ + fill_value = True + elif isinstance(pd_array, pd.arrays.IntegerArray): + dtype = np.int32 + fill_value = R_INT_NA + elif isinstance(pd_array, pd.arrays.FloatingArray): + dtype = np.float64 + fill_value = R_FLOAT_NA + + mask = pd_array.isna() + if np.any(mask): + data = np.empty(pd_array.shape, dtype=dtype) + data[~mask] = pd_array[~mask].to_numpy() + data[mask] = fill_value + if isinstance(pd_array, pd.arrays.FloatingArray): + array = data + else: + array = np.ma.array( + data=data, + mask=mask, + fill_value=fill_value, + ) + else: + array = pd_array.to_numpy() + assert array.dtype == dtype + return array + + elif isinstance(pd_array, ( + pd.arrays.NumpyExtensionArray, # type: ignore [attr-defined] + )): + array = pd_array.to_numpy() + return array + + msg = f"pandas array {type(array)} not implemented" + raise NotImplementedError(msg) def build_r_object( @@ -332,17 +371,23 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 r_value = [self.convert_to_r_object(el) for el in values] if isinstance(data, dict): - names = create_unicode_array(data.keys()) + names = np.array(list(data.keys()), dtype=np.dtype("U")) attributes = self.build_r_list({"names": names}) elif isinstance(data, np.ndarray): if data.dtype.kind in ["O"]: - # This is a special case handling only np.array([None]) - if data.size != 1 or data[0] is not None: - msg = "general object array not implemented" - raise NotImplementedError(msg) + assert data.ndim == 1 r_type = RObjectType.STR - r_value = [build_r_object(RObjectType.CHAR)] + r_value = [] + for el in data: + if el is None or pd.isna(el): + r_el = build_r_object(RObjectType.CHAR) + elif isinstance(el, str): + r_el = self.convert_to_r_object(el.encode(self.encoding)) + else: + msg = "general object array not implemented" + raise NotImplementedError(msg) + r_value.append(r_el) elif data.dtype.kind in ["S"]: assert data.ndim == 1 @@ -432,7 +477,7 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 r_type = RObjectType.INT r_value = data.codes + 1 attributes = self.build_r_list({ - "levels": create_unicode_array(data.categories), + "levels": data.categories.to_numpy(), "class": "factor", }) @@ -445,26 +490,12 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 assert isinstance(column, str) column_names.append(column) - array = series.array - if isinstance(array, pd.Categorical): - r_series = self.convert_to_r_object(array) - elif isinstance(array, pd.arrays.StringArray): - r_series = self.convert_to_r_object(create_unicode_array(array)) - elif isinstance(array, ( - pd.arrays.BooleanArray, - pd.arrays.IntegerArray, - pd.arrays.FloatingArray, - pd.arrays.NumpyExtensionArray, # type: ignore [attr-defined] - )): - np_array = array.to_numpy() - if np_array.dtype.kind == "O": - r_series = self.convert_to_r_object(create_unicode_array(array)) - else: - r_series = self.convert_to_r_object(array.to_numpy()) + pd_array = series.array + if isinstance(pd_array, pd.Categorical): + array = pd_array else: - msg = f"pd.DataFrame with pd.Series {type(array)} not implemented" - raise NotImplementedError(msg) - + array = convert_pd_array_to_np_array(pd_array) + r_series = self.convert_to_r_object(array) r_value.append(r_series) index = data.index @@ -483,7 +514,7 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 row_names = range(index.start, index.stop, index.step) elif isinstance(index, pd.Index): if index.dtype == "object": - row_names = create_unicode_array(index) + row_names = index.to_numpy() elif np.issubdtype(str(index.dtype), np.integer): row_names = index.to_numpy() else: @@ -494,7 +525,7 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 raise NotImplementedError(msg) attr_dict = { - "names": create_unicode_array(column_names), + "names": np.array(column_names, dtype=np.dtype("U")), "row.names": row_names, "class": "data.frame", } diff --git a/rdata/parser/__init__.py b/rdata/parser/__init__.py index d62b6e9..683d039 100644 --- a/rdata/parser/__init__.py +++ b/rdata/parser/__init__.py @@ -2,6 +2,7 @@ from ._parser import ( DEFAULT_ALTREP_MAP as DEFAULT_ALTREP_MAP, + R_FLOAT_NA as R_FLOAT_NA, R_INT_NA as R_INT_NA, CharFlags as CharFlags, RData as RData, diff --git a/rdata/parser/_parser.py b/rdata/parser/_parser.py index 6b9808f..e88fc42 100644 --- a/rdata/parser/_parser.py +++ b/rdata/parser/_parser.py @@ -31,6 +31,10 @@ #: Value used to represent a missing integer in R. R_INT_NA: Final = -2**31 +#: Value used to represent a missing float in R. +# This is a NaN with a particular payload, but it's not the same as np.nan. +R_FLOAT_NA: Final = np.frombuffer(b"\x7f\xf0\x00\x00\x00\x00\x07\xa2", dtype=">f8").astype("=f8")[0] # noqa: E501 + @runtime_checkable class BinaryFileLike(Protocol): From 38c80d73685738ffc3c9be08f22bed7b86c3fca7 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 18:19:03 +0300 Subject: [PATCH 036/100] Fix dataframe attribute order --- rdata/tests/test_write.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rdata/tests/test_write.py b/rdata/tests/test_write.py index fe67aef..86cca4d 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -140,6 +140,7 @@ def test_convert_to_r(fname: str, expand_altrep: bool) -> None: # noqa: FBT001 "test_dataframe_int_rownames.rds", "test_dataframe_range_rownames.rds", "test_dataframe_rownames.rda", + "test_dataframe_dtypes_with_na.rds", ]: converter.df_attr_order = ["names", "class", "row.names"] From b622c9cd48d9a724786a6c65cb84b3ac4b3d6ee3 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 18:28:58 +0300 Subject: [PATCH 037/100] Add NA floats to ascii parser --- rdata/parser/_ascii.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/rdata/parser/_ascii.py b/rdata/parser/_ascii.py index 15f59a7..976d4df 100644 --- a/rdata/parser/_ascii.py +++ b/rdata/parser/_ascii.py @@ -6,7 +6,15 @@ import numpy as np import numpy.typing as npt -from ._parser import R_INT_NA, AltRepConstructorMap, Parser +from ._parser import R_FLOAT_NA, R_INT_NA, AltRepConstructorMap, Parser + + +def map_int_na(line: str) -> int: + return R_INT_NA if line == "NA" else int(line) + + +def map_float_na(line: str) -> float: + return R_FLOAT_NA if line == "NA" else float(line) class ParserASCII(Parser): @@ -42,14 +50,16 @@ def _parse_array_values( line = self._readline() if np.issubdtype(dtype, np.integer): - value = R_INT_NA if line == "NA" else int(line) + value = map_int_na(line) elif np.issubdtype(dtype, np.floating): - value = float(line) + value = map_float_na(line) elif np.issubdtype(dtype, np.complexfloating): + value1 = map_float_na(line) line2 = self._readline() - value = complex(float(line), float(line2)) + value2 = map_float_na(line2) + value = complex(value1, value2) else: msg = f"Unknown dtype: {dtype}" From c2728e281c5fdf82d9a01eb63df1111f3218c40d Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 18:30:11 +0300 Subject: [PATCH 038/100] Add NA floats to ascii unparser --- rdata/unparser/_ascii.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/rdata/unparser/_ascii.py b/rdata/unparser/_ascii.py index a20f8fc..a8bdc78 100644 --- a/rdata/unparser/_ascii.py +++ b/rdata/unparser/_ascii.py @@ -7,6 +7,8 @@ import numpy as np +from rdata.parser import R_FLOAT_NA + from ._unparser import Unparser if TYPE_CHECKING: @@ -15,6 +17,11 @@ import numpy.typing as npt +def is_float_na(value: float) -> bool: + """Check if value is NA value.""" + return np.array(value).tobytes() == np.array(R_FLOAT_NA).tobytes() + + class UnparserASCII(Unparser): """Unparser for files in ASCII format.""" @@ -35,7 +42,7 @@ def unparse_magic(self) -> None: """Unparse magic bits.""" self._add_line("A") - def _unparse_array_values(self, array: npt.NDArray[Any]) -> None: + def _unparse_array_values(self, array: npt.NDArray[Any]) -> None: # noqa: C901 # Convert boolean to int if np.issubdtype(array.dtype, np.bool_): array = array.astype(np.int32) @@ -51,7 +58,9 @@ def _unparse_array_values(self, array: npt.NDArray[Any]) -> None: line = "NA" if value is None or np.ma.is_masked(value) else str(value) # type: ignore [no-untyped-call] elif np.issubdtype(array.dtype, np.floating): - if np.isnan(value): + if is_float_na(value): + line = "NA" + elif np.isnan(value): line = "NaN" elif value == np.inf: line = "Inf" From e27217ca838f6afad63f35d467192fc22aaf8819 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 12 Sep 2024 18:46:28 +0300 Subject: [PATCH 039/100] Fix ruff --- rdata/conversion/to_r.py | 15 ++++++------ rdata/tests/test_rdata.py | 50 +++++++++++++++++++++++++++++---------- 2 files changed, 44 insertions(+), 21 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 372c749..9de4417 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -57,7 +57,7 @@ def __call__(self, data: Any) -> RObject: # noqa: ANN401 def convert_pd_array_to_np_array( - pd_array: Any, + pd_array: Any, # noqa: ANN401 ) -> npt.NDArray[Any]: """ Convert pandas array object to numpy array. @@ -70,7 +70,8 @@ def convert_pd_array_to_np_array( """ if isinstance(pd_array, pd.arrays.StringArray): return pd_array.to_numpy() - elif isinstance(pd_array, ( + + if isinstance(pd_array, ( pd.arrays.BooleanArray, pd.arrays.IntegerArray, pd.arrays.FloatingArray, @@ -103,11 +104,10 @@ def convert_pd_array_to_np_array( assert array.dtype == dtype return array - elif isinstance(pd_array, ( + if isinstance(pd_array, ( pd.arrays.NumpyExtensionArray, # type: ignore [attr-defined] )): - array = pd_array.to_numpy() - return array + return pd_array.to_numpy() msg = f"pandas array {type(array)} not implemented" raise NotImplementedError(msg) @@ -513,9 +513,8 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 else: row_names = range(index.start, index.stop, index.step) elif isinstance(index, pd.Index): - if index.dtype == "object": - row_names = index.to_numpy() - elif np.issubdtype(str(index.dtype), np.integer): + if (index.dtype == "object" + or np.issubdtype(str(index.dtype), np.integer)): row_names = index.to_numpy() else: msg = f"pd.DataFrame pd.Index {index.dtype} not implemented" diff --git a/rdata/tests/test_rdata.py b/rdata/tests/test_rdata.py index dfe03d8..9e4d663 100644 --- a/rdata/tests/test_rdata.py +++ b/rdata/tests/test_rdata.py @@ -517,7 +517,9 @@ def test_dataframe_int_rownames(self) -> None: index = np.array([3, 6, 9], dtype=np.int32) ref = pd.DataFrame( { - "col1": pd.Series([10., 20., 30.], dtype=pd.Float64Dtype(), index=index), + "col1": pd.Series( + [10., 20., 30.], + dtype=pd.Float64Dtype(), index=index), }, index=index, ) @@ -532,7 +534,9 @@ def test_dataframe_range_rownames(self) -> None: index = pd.RangeIndex(2, 5) ref = pd.DataFrame( { - "col1": pd.Series([10., 20., 30.], dtype=pd.Float64Dtype(), index=index), + "col1": pd.Series( + [10., 20., 30.], + dtype=pd.Float64Dtype(), index=index), }, index=index, ) @@ -547,11 +551,21 @@ def test_dataframe_dtypes(self) -> None: index = pd.RangeIndex(1, 4) ref = pd.DataFrame( { - "int": pd.Series([10, 20, 30], dtype=pd.Int32Dtype(), index=index), - "float": pd.Series([1.1, 2.2, 3.3], dtype=pd.Float64Dtype(), index=index), - "string": pd.Series(["x" ,"y", "z"], dtype=pd.StringDtype(), index=index), - "bool": pd.Series([True, False, True], dtype=pd.BooleanDtype(), index=index), - "complex": pd.Series([4+5j, 6+7j, 8+9j], dtype=complex, index=index), + "int": pd.Series( + [10, 20, 30], + dtype=pd.Int32Dtype(), index=index), + "float": pd.Series( + [1.1, 2.2, 3.3], + dtype=pd.Float64Dtype(), index=index), + "string": pd.Series( + ["x" ,"y", "z"], + dtype=pd.StringDtype(), index=index), + "bool": pd.Series( + [True, False, True], + dtype=pd.BooleanDtype(), index=index), + "complex": pd.Series( + [4+5j, 6+7j, 8+9j], + dtype=complex, index=index), }, index=index, ) @@ -560,17 +574,27 @@ def test_dataframe_dtypes(self) -> None: def test_dataframe_dtypes_with_na(self) -> None: """Test dataframe conversion.""" # File created in R with - # df = data.frame(int=c(10L, 20L, 30L, NA), float=c(1.1, 2.2, 3.3, NA), string=c("x", "y", "z", NA), bool=as.logical(c(1, 0, 1, NA)), complex=c(4+5i, 6+7i, 8+9i, NA)); saveRDS(df, file="test_dataframe_dtypes_with_na.rds") + # df = data.frame(int=c(10L, 20L, 30L, NA), float=c(1.1, 2.2, 3.3, NA), string=c("x", "y", "z", NA), bool=as.logical(c(1, 0, 1, NA)), complex=c(4+5i, 6+7i, 8+9i, NA)); saveRDS(df, file="test_dataframe_dtypes_with_na.rds") # noqa: E501 data = rdata.read_rds(TESTDATA_PATH / "test_dataframe_dtypes_with_na.rds") index = pd.RangeIndex(1, 5) ref = pd.DataFrame( { - "int": pd.Series([10, 20, 30, None], dtype=pd.Int32Dtype(), index=index), - "float": pd.Series([1.1, 2.2, 3.3, None], dtype=pd.Float64Dtype(), index=index), - "string": pd.Series(["x" ,"y", "z", None], dtype=pd.StringDtype(), index=index), - "bool": pd.Series([True, False, True, None], dtype=pd.BooleanDtype(), index=index), - "complex": pd.Series([4+5j, 6+7j, 8+9j, None], dtype=complex, index=index), + "int": pd.Series( + [10, 20, 30, None], + dtype=pd.Int32Dtype(), index=index), + "float": pd.Series( + [1.1, 2.2, 3.3, None], + dtype=pd.Float64Dtype(), index=index), + "string": pd.Series( + ["x" ,"y", "z", None], + dtype=pd.StringDtype(), index=index), + "bool": pd.Series( + [True, False, True, None], + dtype=pd.BooleanDtype(), index=index), + "complex": pd.Series( + [4+5j, 6+7j, 8+9j, None], + dtype=complex, index=index), }, index=index, ) From 9da1a4286e51031de1bb164ab5408dfed04e6193 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Fri, 13 Sep 2024 13:27:47 +0300 Subject: [PATCH 040/100] Define NA checker function close to the definition --- rdata/parser/__init__.py | 1 + rdata/parser/_parser.py | 5 +++++ rdata/unparser/_ascii.py | 7 +------ 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/rdata/parser/__init__.py b/rdata/parser/__init__.py index 683d039..98375fe 100644 --- a/rdata/parser/__init__.py +++ b/rdata/parser/__init__.py @@ -11,6 +11,7 @@ RObjectInfo as RObjectInfo, RObjectType as RObjectType, RVersions as RVersions, + is_float_na as is_float_na, parse_data as parse_data, parse_file as parse_file, ) diff --git a/rdata/parser/_parser.py b/rdata/parser/_parser.py index e88fc42..929f1ac 100644 --- a/rdata/parser/_parser.py +++ b/rdata/parser/_parser.py @@ -36,6 +36,11 @@ R_FLOAT_NA: Final = np.frombuffer(b"\x7f\xf0\x00\x00\x00\x00\x07\xa2", dtype=">f8").astype("=f8")[0] # noqa: E501 +def is_float_na(value: float) -> bool: + """Check if value is NA value.""" + return np.array(value).tobytes() == np.array(R_FLOAT_NA).tobytes() + + @runtime_checkable class BinaryFileLike(Protocol): """Protocol for binary files.""" diff --git a/rdata/unparser/_ascii.py b/rdata/unparser/_ascii.py index a8bdc78..4ea9863 100644 --- a/rdata/unparser/_ascii.py +++ b/rdata/unparser/_ascii.py @@ -7,7 +7,7 @@ import numpy as np -from rdata.parser import R_FLOAT_NA +from rdata.parser import is_float_na from ._unparser import Unparser @@ -17,11 +17,6 @@ import numpy.typing as npt -def is_float_na(value: float) -> bool: - """Check if value is NA value.""" - return np.array(value).tobytes() == np.array(R_FLOAT_NA).tobytes() - - class UnparserASCII(Unparser): """Unparser for files in ASCII format.""" From 3dc01696a6c936b30ec8b79aa29bc0d3fa478298 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Fri, 13 Sep 2024 16:39:54 +0300 Subject: [PATCH 041/100] Fix mypy --- rdata/conversion/to_r.py | 15 +++++++++------ rdata/parser/_parser.py | 4 ++-- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 9de4417..c95dd7c 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -74,15 +74,17 @@ def convert_pd_array_to_np_array( if isinstance(pd_array, ( pd.arrays.BooleanArray, pd.arrays.IntegerArray, - pd.arrays.FloatingArray, + pd.arrays.FloatingArray, # type: ignore [attr-defined] )): + dtype: type[Any] + fill_value: bool | int | float if isinstance(pd_array, pd.arrays.BooleanArray): dtype = np.bool_ fill_value = True elif isinstance(pd_array, pd.arrays.IntegerArray): dtype = np.int32 fill_value = R_INT_NA - elif isinstance(pd_array, pd.arrays.FloatingArray): + elif isinstance(pd_array, pd.arrays.FloatingArray): # type: ignore [attr-defined] dtype = np.float64 fill_value = R_FLOAT_NA @@ -91,10 +93,10 @@ def convert_pd_array_to_np_array( data = np.empty(pd_array.shape, dtype=dtype) data[~mask] = pd_array[~mask].to_numpy() data[mask] = fill_value - if isinstance(pd_array, pd.arrays.FloatingArray): + if isinstance(pd_array, pd.arrays.FloatingArray): # type: ignore [attr-defined] array = data else: - array = np.ma.array( + array = np.ma.array( # type: ignore [no-untyped-call] data=data, mask=mask, fill_value=fill_value, @@ -105,7 +107,7 @@ def convert_pd_array_to_np_array( return array if isinstance(pd_array, ( - pd.arrays.NumpyExtensionArray, # type: ignore [attr-defined] + pd.arrays.NumpyExtensionArray, )): return pd_array.to_numpy() @@ -196,7 +198,7 @@ def __init__(self, *, # In test files the order in which dataframe attributes are written varies. # R can read files with attributes in any order, but this variable # is used in tests to change the attribute order to match with the test file. - self.df_attr_order = None + self.df_attr_order: list[str] | None = None def build_r_data(self, @@ -491,6 +493,7 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 column_names.append(column) pd_array = series.array + array: pd.Categorical | npt.NDArray[Any] if isinstance(pd_array, pd.Categorical): array = pd_array else: diff --git a/rdata/parser/_parser.py b/rdata/parser/_parser.py index 929f1ac..aaa0123 100644 --- a/rdata/parser/_parser.py +++ b/rdata/parser/_parser.py @@ -29,11 +29,11 @@ #: Value used to represent a missing integer in R. -R_INT_NA: Final = -2**31 +R_INT_NA: Final[int] = -2**31 #: Value used to represent a missing float in R. # This is a NaN with a particular payload, but it's not the same as np.nan. -R_FLOAT_NA: Final = np.frombuffer(b"\x7f\xf0\x00\x00\x00\x00\x07\xa2", dtype=">f8").astype("=f8")[0] # noqa: E501 +R_FLOAT_NA: Final[float] = np.frombuffer(b"\x7f\xf0\x00\x00\x00\x00\x07\xa2", dtype=">f8").astype("=f8")[0] # noqa: E501 def is_float_na(value: float) -> bool: From d4049ba14c516b81ca802d52f1f23ab79e4b0237 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Fri, 13 Sep 2024 16:47:05 +0300 Subject: [PATCH 042/100] Simplify reference lists --- rdata/conversion/to_r.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index c95dd7c..201de14 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -192,8 +192,8 @@ def __init__(self, *, self.encoding = encoding self.format_version = format_version self.r_version_serialized = r_version_serialized - self.reference_name_list: list[None | str] = [None] - self.reference_obj_list: list[None | RObject] = [None] + self._references: dict[str | None, tuple[int, RObject | None]] \ + = {None: (0, None)} # In test files the order in which dataframe attributes are written varies. # R can read files with attributes in any order, but this variable @@ -290,18 +290,16 @@ def build_r_sym(self, R object. """ # Reference to existing symbol if exists - if name in self.reference_name_list: - idx = self.reference_name_list.index(name) - obj = self.reference_obj_list[idx] - return build_r_object(RObjectType.REF, reference=(idx, obj)) + if name in self._references: + reference = self._references[name] + return build_r_object(RObjectType.REF, reference=reference) # Create a new symbol r_value = self.convert_to_r_object(name.encode(self.encoding)) r_object = build_r_object(RObjectType.SYM, value=r_value) # Add to reference list - self.reference_name_list.append(name) - self.reference_obj_list.append(r_object) + self._references[name] = (len(self._references), r_object) return r_object From cb3c487576135ee1dc750c9e7c9d7ba3eebb1f8a Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Fri, 13 Sep 2024 17:40:08 +0300 Subject: [PATCH 043/100] Simplify creation of R lists --- rdata/conversion/to_r.py | 119 ++++++++++++++++++--------------------- 1 file changed, 56 insertions(+), 63 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 201de14..6e0ec31 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -28,21 +28,13 @@ if TYPE_CHECKING: from collections.abc import Mapping - from typing import Any, Final, Literal, Protocol + from typing import Any, Final, Literal import numpy.typing as npt Encoding = Literal["utf-8", "cp1252"] - class Converter(Protocol): - """Protocol for Py-to-R conversion.""" - - def __call__(self, data: Any) -> RObject: # noqa: ANN401 - """Convert Python object to R object.""" - - - # Default values for RVersions object DEFAULT_FORMAT_VERSION: Final[int] = 3 DEFAULT_R_VERSION_SERIALIZED: Final[int] = 0x40201 @@ -166,6 +158,34 @@ def build_r_object( ) +def build_r_list( + data: list[RObject] | list[tuple[RObject, RObject]], +) -> RObject: + """ + Build R object representing (named) linked list. + + Args: + data: Non-empty list of values or (key, value) pairs. + + Returns: + R object. + """ + if len(data) == 0: + msg = "data must not be empty" + raise ValueError(msg) + + head = data[0] + tail = data[1:] + if isinstance(head, tuple): + tag, car = head + else: + tag = None + car = head + + cdr = build_r_object(RObjectType.NILVALUE) if len(tail) == 0 else build_r_list(tail) + + return build_r_object(RObjectType.LIST, value=(car, cdr), tag=tag) + class ConverterFromPythonToR: """ @@ -229,52 +249,23 @@ def build_r_data(self, return RData(versions, extra, r_object) - def build_r_list(self, - data: Mapping[str, Any] | list[Any], - *, - convert_value: Converter | None = None, + def convert_to_r_attributes(self, + data: dict[str, Any], ) -> RObject: """ - Build R object representing named linked list. + Convert dictionary to R attributes list. Args: - data: Non-empty dictionary or list. - convert_value: Function used for converting value to R object - (for example, convert_to_r_object). + data: Non-empty dictionary. Returns: R object. """ - if convert_value is None: - convert_value = self.convert_to_r_object - - if len(data) == 0: - msg = "data must not be empty" - raise ValueError(msg) - - if isinstance(data, dict): - data = data.copy() - key = next(iter(data)) - tag = self.build_r_sym(key) - car = data.pop(key) - elif isinstance(data, list): - car = data[0] - data = data[1:] - tag = None - - if not isinstance(car, RObject): - car = convert_value(car) - - if len(data) == 0: - cdr = build_r_object(RObjectType.NILVALUE) - else: - cdr = self.build_r_list(data, convert_value=convert_value) + converted = [] + for key, value in data.items(): + converted.append((self.build_r_sym(key), self.convert_to_r_object(value))) - return build_r_object( - RObjectType.LIST, - value=(car, cdr), - tag=tag, - ) + return build_r_list(converted) def build_r_sym(self, @@ -321,7 +312,7 @@ def convert_to_r_object_for_rda(self, if not isinstance(data, dict): msg = f"for RDA file, data must be a dictionary, not type {type(data)}" raise TypeError(msg) - return self.build_r_list(data) + return self.convert_to_r_attributes(data) def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 @@ -341,7 +332,7 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 values: list[Any] | tuple[Any, ...] r_value: Any = None is_object = False - attributes = None + attributes: dict[str, Any] | None = None tag = None gp = 0 @@ -354,14 +345,12 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 elif isinstance(data, RLanguage): r_type = RObjectType.LANG - values = data.elements - r_value = (self.build_r_sym(str(values[0])), - self.build_r_list(values[1:], - convert_value=self.build_r_sym)) # type: ignore [arg-type] + symbols = [self.build_r_sym(el) for el in data.elements] + r_value = (symbols[0], build_r_list(symbols[1:])) if len(data.attributes) > 0: # The following might work here (untested) - # attributes = build_r_list(data.attributes) # noqa: ERA001 + # attributes = data.attributes # noqa: ERA001 msg = f"type {r_type} with attributes not implemented" raise NotImplementedError(msg) @@ -372,7 +361,7 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 if isinstance(data, dict): names = np.array(list(data.keys()), dtype=np.dtype("U")) - attributes = self.build_r_list({"names": names}) + attributes = {"names": names} elif isinstance(data, np.ndarray): if data.dtype.kind in ["O"]: @@ -415,7 +404,7 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 else: # R uses column-major order like Fortran r_value = np.ravel(data, order="F") - attributes = self.build_r_list({"dim": np.array(data.shape)}) + attributes = {"dim": np.array(data.shape)} elif isinstance(data, (bool, int, float, complex)): return self.convert_to_r_object(np.array(data)) @@ -455,10 +444,10 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 r_type = RObjectType.ALTREP r_value = ( - self.build_r_list([ + build_r_list([ self.build_r_sym("compact_intseq"), self.build_r_sym("base"), - RObjectType.INT.value, + self.convert_to_r_object(RObjectType.INT.value), ]), self.convert_to_r_object(np.array([ len(data), @@ -476,10 +465,10 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 is_object = True r_type = RObjectType.INT r_value = data.codes + 1 - attributes = self.build_r_list({ + attributes = { "levels": data.categories.to_numpy(), "class": "factor", - }) + } elif isinstance(data, pd.DataFrame): is_object = True @@ -524,20 +513,24 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 msg = f"pd.DataFrame index {type(index)} not implemented" raise NotImplementedError(msg) - attr_dict = { + attributes = { "names": np.array(column_names, dtype=np.dtype("U")), "row.names": row_names, "class": "data.frame", } if self.df_attr_order is not None: - attr_dict = {k: attr_dict[k] for k in self.df_attr_order} - attributes = self.build_r_list(attr_dict) + attributes = {k: attributes[k] for k in self.df_attr_order} else: msg = f"type {type(data)} not implemented" raise NotImplementedError(msg) + if attributes is not None: + r_attributes = self.convert_to_r_attributes(attributes) + else: + r_attributes = None + return build_r_object(r_type, value=r_value, is_object=is_object, - attributes=attributes, + attributes=r_attributes, tag=tag, gp=gp) From 76bdc83091a7b063acffc1d3273ab59fc7458a4e Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Fri, 13 Sep 2024 17:57:55 +0300 Subject: [PATCH 044/100] Rename build_r_sym() to convert_to_r_sym() --- rdata/conversion/to_r.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 6e0ec31..f47f132 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -263,16 +263,19 @@ def convert_to_r_attributes(self, """ converted = [] for key, value in data.items(): - converted.append((self.build_r_sym(key), self.convert_to_r_object(value))) + converted.append(( + self.convert_to_r_sym(key), + self.convert_to_r_object(value), + )) return build_r_list(converted) - def build_r_sym(self, + def convert_to_r_sym(self, name: str, ) -> RObject: """ - Build R object representing symbol. + Convert string to R symbol. Args: name: String. @@ -345,7 +348,7 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 elif isinstance(data, RLanguage): r_type = RObjectType.LANG - symbols = [self.build_r_sym(el) for el in data.elements] + symbols = [self.convert_to_r_sym(el) for el in data.elements] r_value = (symbols[0], build_r_list(symbols[1:])) if len(data.attributes) > 0: @@ -445,8 +448,8 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 r_type = RObjectType.ALTREP r_value = ( build_r_list([ - self.build_r_sym("compact_intseq"), - self.build_r_sym("base"), + self.convert_to_r_sym("compact_intseq"), + self.convert_to_r_sym("base"), self.convert_to_r_object(RObjectType.INT.value), ]), self.convert_to_r_object(np.array([ From 87194d4674e087b0b62172d65d92824a7423f2f2 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Fri, 13 Sep 2024 17:58:52 +0300 Subject: [PATCH 045/100] Simplify creation of RData object --- rdata/_write.py | 6 ++---- rdata/conversion/to_r.py | 42 ++++++++++++++++----------------------- rdata/tests/test_write.py | 17 +++++----------- 3 files changed, 24 insertions(+), 41 deletions(-) diff --git a/rdata/_write.py b/rdata/_write.py index a1fd162..c56e274 100644 --- a/rdata/_write.py +++ b/rdata/_write.py @@ -53,8 +53,7 @@ def write_rds( encoding=encoding, format_version=format_version, ) - r_object = converter.convert_to_r_object(data) - r_data = converter.build_r_data(r_object) + r_data = converter.convert_to_r_data(data) unparse_file( path, @@ -103,8 +102,7 @@ def write_rda( encoding=encoding, format_version=format_version, ) - r_object = converter.convert_to_r_object_for_rda(data) - r_data = converter.build_r_data(r_object) + r_data = converter.convert_to_r_data(data, file_type="rda") unparse_file( path, diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index f47f132..785e311 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -32,6 +32,8 @@ import numpy.typing as npt + from rdata.unparser import FileType + Encoding = Literal["utf-8", "cp1252"] @@ -221,14 +223,17 @@ def __init__(self, *, self.df_attr_order: list[str] | None = None - def build_r_data(self, - r_object: RObject, + def convert_to_r_data(self, + data: Any, # noqa: ANN401 + *, + file_type: FileType = "rds", ) -> RData: """ - Build RData object from R object. + Convert Python data to R data. Args: - r_object: R object. + data: Any Python object. + file_type: File type. Returns: Corresponding RData object. @@ -236,6 +241,14 @@ def build_r_data(self, See Also: convert_to_r_object """ + if file_type == "rda": + if not isinstance(data, dict): + msg = f"for RDA file, data must be a dictionary, not type {type(data)}" + raise TypeError(msg) + r_object = self.convert_to_r_attributes(data) + else: + r_object = self.convert_to_r_object(data) + versions = RVersions( self.format_version, self.r_version_serialized, @@ -297,27 +310,6 @@ def convert_to_r_sym(self, return r_object - def convert_to_r_object_for_rda(self, - data: Mapping[str, Any], - ) -> RObject: - """ - Convert Python dictionary to R object for RDA file. - - Args: - data: Python dictionary with data and variable names. - - Returns: - Corresponding R object. - - See Also: - convert_to_r_object - """ - if not isinstance(data, dict): - msg = f"for RDA file, data must be a dictionary, not type {type(data)}" - raise TypeError(msg) - return self.convert_to_r_attributes(data) - - def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 data: Any, # noqa: ANN401 ) -> RObject: diff --git a/rdata/tests/test_write.py b/rdata/tests/test_write.py index 86cca4d..68ea003 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -145,15 +145,10 @@ def test_convert_to_r(fname: str, expand_altrep: bool) -> None: # noqa: FBT001 converter.df_attr_order = ["names", "class", "row.names"] try: - if file_type == "rds": - r_obj = converter.convert_to_r_object(py_data) - else: - r_obj = converter.convert_to_r_object_for_rda(py_data) + new_r_data = converter.convert_to_r_data(py_data, file_type=file_type) except NotImplementedError as e: pytest.xfail(str(e)) - new_r_data = converter.build_r_data(r_obj) - assert str(r_data) == str(new_r_data) assert r_data == new_r_data @@ -175,7 +170,7 @@ def test_convert_to_r_bad_rda() -> None: py_data = "hello" converter = ConverterFromPythonToR() with pytest.raises(TypeError, match="(?i)data must be a dictionary"): - converter.convert_to_r_object_for_rda(py_data) # type: ignore [arg-type] + converter.convert_to_r_data(py_data, file_type="rda") def test_convert_to_r_empty_rda() -> None: @@ -183,15 +178,14 @@ def test_convert_to_r_empty_rda() -> None: py_data: dict[str, Any] = {} converter = ConverterFromPythonToR() with pytest.raises(ValueError, match="(?i)data must not be empty"): - converter.convert_to_r_object_for_rda(py_data) + converter.convert_to_r_data(py_data, file_type="rda") def test_unparse_bad_rda() -> None: """Test checking that data for RDA has variable names.""" py_data = "hello" converter = ConverterFromPythonToR() - r_obj = converter.convert_to_r_object(py_data) - r_data = converter.build_r_data(r_obj) + r_data = converter.convert_to_r_data(py_data) with pytest.raises(ValueError, match="(?i)must be dictionary-like"): unparse_data(r_data, file_type="rda") @@ -214,8 +208,7 @@ def test_unparse_big_int() -> None: """Test checking too large integers.""" big_int = 2**32 converter = ConverterFromPythonToR() - r_obj = converter.convert_to_r_object(big_int) - r_data = converter.build_r_data(r_obj) + r_data = converter.convert_to_r_data(big_int) with pytest.raises(ValueError, match="(?i)not castable"): unparse_data(r_data, file_format="xdr") From d9c3be58eb9c6b015b5704a741df7f5958c358ad Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Fri, 13 Sep 2024 18:14:08 +0300 Subject: [PATCH 046/100] Add helper functions for conversion --- rdata/_write.py | 11 ++++--- rdata/conversion/__init__.py | 2 ++ rdata/conversion/to_r.py | 60 ++++++++++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 5 deletions(-) diff --git a/rdata/_write.py b/rdata/_write.py index c56e274..b1a42e1 100644 --- a/rdata/_write.py +++ b/rdata/_write.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING -from .conversion import ConverterFromPythonToR +from .conversion import convert_python_to_r_data from .conversion.to_r import DEFAULT_FORMAT_VERSION from .unparser import unparse_file @@ -49,11 +49,11 @@ def write_rds( >>> data = ["hello", 1, 2.2, 3.3+4.4j] >>> rdata.write_rds("test.rds", data) """ - converter = ConverterFromPythonToR( + r_data = convert_python_to_r_data( + data, encoding=encoding, format_version=format_version, ) - r_data = converter.convert_to_r_data(data) unparse_file( path, @@ -98,11 +98,12 @@ def write_rda( >>> data = {"name": "hello", "values": [1, 2.2, 3.3+4.4j]} >>> rdata.write_rda("test.rda", data) """ - converter = ConverterFromPythonToR( + r_data = convert_python_to_r_data( + data, encoding=encoding, format_version=format_version, + file_type="rda", ) - r_data = converter.convert_to_r_data(data, file_type="rda") unparse_file( path, diff --git a/rdata/conversion/__init__.py b/rdata/conversion/__init__.py index e802758..55506b3 100644 --- a/rdata/conversion/__init__.py +++ b/rdata/conversion/__init__.py @@ -26,4 +26,6 @@ ) from .to_r import ( ConverterFromPythonToR as ConverterFromPythonToR, + convert_python_to_r_data as convert_python_to_r_data, + convert_python_to_r_object as convert_python_to_r_object, ) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 785e311..7ac4591 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -529,3 +529,63 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 is_object=is_object, attributes=r_attributes, tag=tag, gp=gp) + + +def convert_python_to_r_data( + data: Any, # noqa: ANN401 + *, + encoding: Encoding = "utf-8", + format_version: int = DEFAULT_FORMAT_VERSION, + r_version_serialized: int = DEFAULT_R_VERSION_SERIALIZED, + file_type: FileType = "rds", +) -> RData: + """ + Convert Python data to R data. + + Args: + data: Any Python object. + encoding: Encoding to be used for strings within data. + format_version: File format version. + r_version_serialized: R version written as the creator of the object. + file_type: File type. + + Returns: + Corresponding RData object. + + See Also: + convert_python_to_r_object + """ + return ConverterFromPythonToR( + encoding=encoding, + format_version=format_version, + r_version_serialized=r_version_serialized, + ).convert_to_r_data(data, file_type=file_type) + + +def convert_python_to_r_object( + data: Any, # noqa: ANN401 + *, + encoding: Encoding = "utf-8", + format_version: int = DEFAULT_FORMAT_VERSION, + r_version_serialized: int = DEFAULT_R_VERSION_SERIALIZED, +) -> RObject: + """ + Convert Python data to R object. + + Args: + data: Any Python object. + encoding: Encoding to be used for strings within data. + format_version: File format version. + r_version_serialized: R version written as the creator of the object. + + Returns: + Corresponding RObject object. + + See Also: + convert_python_to_r_data + """ + return ConverterFromPythonToR( + encoding=encoding, + format_version=format_version, + r_version_serialized=r_version_serialized, + ).convert_to_r_object(data) From 22fe43d9e057139bdf035284c29a9746735e61aa Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 16 Sep 2024 08:19:24 +0300 Subject: [PATCH 047/100] Clarify NA values --- rdata/tests/test_rdata.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rdata/tests/test_rdata.py b/rdata/tests/test_rdata.py index 9e4d663..579ab4f 100644 --- a/rdata/tests/test_rdata.py +++ b/rdata/tests/test_rdata.py @@ -581,19 +581,19 @@ def test_dataframe_dtypes_with_na(self) -> None: ref = pd.DataFrame( { "int": pd.Series( - [10, 20, 30, None], + [10, 20, 30, pd.NA], dtype=pd.Int32Dtype(), index=index), "float": pd.Series( - [1.1, 2.2, 3.3, None], + [1.1, 2.2, 3.3, pd.NA], dtype=pd.Float64Dtype(), index=index), "string": pd.Series( - ["x" ,"y", "z", None], + ["x" ,"y", "z", pd.NA], dtype=pd.StringDtype(), index=index), "bool": pd.Series( - [True, False, True, None], + [True, False, True, pd.NA], dtype=pd.BooleanDtype(), index=index), "complex": pd.Series( - [4+5j, 6+7j, 8+9j, None], + [4+5j, 6+7j, 8+9j, rdata.parser.R_FLOAT_NA], dtype=complex, index=index), }, index=index, From 3bd285a52f301471c9d95ec24a1feda2ead43699 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 16 Sep 2024 08:19:43 +0300 Subject: [PATCH 048/100] Filter expected warnings --- rdata/tests/test_rdata.py | 5 ++++- rdata/tests/test_write.py | 7 +++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/rdata/tests/test_rdata.py b/rdata/tests/test_rdata.py index 579ab4f..41534ec 100644 --- a/rdata/tests/test_rdata.py +++ b/rdata/tests/test_rdata.py @@ -598,7 +598,10 @@ def test_dataframe_dtypes_with_na(self) -> None: }, index=index, ) - pd.testing.assert_frame_equal(data, ref) + + with np.errstate(invalid="ignore"): + # Comparing complex arrays with R_FLOAT_NA gives warning + pd.testing.assert_frame_equal(data, ref) def test_ts(self) -> None: """Test time series conversion.""" diff --git a/rdata/tests/test_write.py b/rdata/tests/test_write.py index 68ea003..6d2db28 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -85,7 +85,8 @@ def test_unparse(fname: str) -> None: with (TESTDATA_PATH / fname).open("rb") as f: data = decompress_data(f.read()) file_type, file_format = parse_file_type_and_format(data) - r_data = rdata.parser.parse_data(data, expand_altrep=False) + r_data = rdata.parser.parse_data( + data, expand_altrep=False, extension=f".{file_type}") try: out_data = unparse_data( @@ -99,6 +100,7 @@ def test_unparse(fname: str) -> None: assert data == out_data +@pytest.mark.filterwarnings("ignore:Missing constructor") @pytest.mark.parametrize("fname", fnames, ids=fnames) @pytest.mark.parametrize("expand_altrep", [True, False]) def test_convert_to_r(fname: str, expand_altrep: bool) -> None: # noqa: FBT001 @@ -116,7 +118,8 @@ def test_convert_to_r(fname: str, expand_altrep: bool) -> None: # noqa: FBT001 data = decompress_data(f.read()) file_type, file_format = parse_file_type_and_format(data) - r_data = rdata.parser.parse_data(data, expand_altrep=expand_altrep) + r_data = rdata.parser.parse_data( + data, expand_altrep=expand_altrep, extension=f".{file_type}") try: py_data = rdata.conversion.convert(r_data) From 80a0c9be7d5b3f80427af65aeceec582982c1f79 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 16 Sep 2024 09:18:45 +0300 Subject: [PATCH 049/100] Add test for dataframe with NA and NaN floats --- .../data/test_dataframe_float_with_na_nan.rds | Bin 0 -> 147 bytes rdata/tests/test_rdata.py | 17 +++++++++++++++++ rdata/tests/test_write.py | 1 + 3 files changed, 18 insertions(+) create mode 100644 rdata/tests/data/test_dataframe_float_with_na_nan.rds diff --git a/rdata/tests/data/test_dataframe_float_with_na_nan.rds b/rdata/tests/data/test_dataframe_float_with_na_nan.rds new file mode 100644 index 0000000000000000000000000000000000000000..fed00f4d1da15f2a404f9b2054696038d3559b71 GIT binary patch literal 147 zcmb2|=3oE==I#ec2?+^l35kr8);Op!XJ>TGUdK8?o#%-GlcBu8=G*miIGHxfNGvo7 zmy)^XAhN|J&zL!^POy8T9$*wiAw zB*R@^>!uu; None: # Comparing complex arrays with R_FLOAT_NA gives warning pd.testing.assert_frame_equal(data, ref) + def test_dataframe_float_with_na_nan(self) -> None: + """Test dataframe conversion.""" + # File created in R with + # df = data.frame(float=c(1.1, 2.2, 3.3, NA, NaN, Inf, -Inf)); saveRDS(df, file="test_dataframe_float_with_na_nan.rds") # noqa: E501,ERA001 + data = rdata.read_rds(TESTDATA_PATH / "test_dataframe_float_with_na_nan.rds") + + index = pd.RangeIndex(1, 8) + ref = pd.DataFrame( + { + "float": pd.Series( + [1.1, 2.2, 3.3, rdata.parser.R_FLOAT_NA, np.nan, np.inf, -np.inf], + dtype=float, index=index), + }, + index=index, + ) + pd.testing.assert_frame_equal(data, ref) + def test_ts(self) -> None: """Test time series conversion.""" data = rdata.read_rda(TESTDATA_PATH / "test_ts.rda") diff --git a/rdata/tests/test_write.py b/rdata/tests/test_write.py index 6d2db28..3e9b5ec 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -144,6 +144,7 @@ def test_convert_to_r(fname: str, expand_altrep: bool) -> None: # noqa: FBT001 "test_dataframe_range_rownames.rds", "test_dataframe_rownames.rda", "test_dataframe_dtypes_with_na.rds", + "test_dataframe_float_with_na_nan.rds", ]: converter.df_attr_order = ["names", "class", "row.names"] From b73d4bd5e0652a9f8b17cef740060b021a853029 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 16 Sep 2024 09:20:25 +0300 Subject: [PATCH 050/100] Do not use pandas floating array --- rdata/conversion/_conversion.py | 8 ++++++- rdata/conversion/to_r.py | 39 ++++++++++++++++++--------------- rdata/tests/test_rdata.py | 10 ++++----- 3 files changed, 33 insertions(+), 24 deletions(-) diff --git a/rdata/conversion/_conversion.py b/rdata/conversion/_conversion.py index 24c5022..8e492e3 100644 --- a/rdata/conversion/_conversion.py +++ b/rdata/conversion/_conversion.py @@ -449,7 +449,13 @@ def _dataframe_column_transform(source: Any) -> Any: # noqa: ANN401 if np.issubdtype(source.dtype, np.integer): dtype = pd.Int32Dtype() elif np.issubdtype(source.dtype, np.floating): - dtype = pd.Float64Dtype() + # We return the numpy array here, which keeps + # R_FLOAT_NA, np.nan, and other NaNs as they were originally in the file. + # Users can then decide if they prefer to interpret + # only R_FLOAT_NA or all NaNs as "missing". + return source + # This would create an array with all NaNs as "missing": + # dtype = pd.Float64Dtype() # noqa: ERA001 elif np.issubdtype(source.dtype, np.complexfloating): # There seems to be no pandas type for complex array return source diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 7ac4591..f09ba17 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -68,42 +68,45 @@ def convert_pd_array_to_np_array( if isinstance(pd_array, ( pd.arrays.BooleanArray, pd.arrays.IntegerArray, - pd.arrays.FloatingArray, # type: ignore [attr-defined] )): dtype: type[Any] - fill_value: bool | int | float + fill_value: bool | int if isinstance(pd_array, pd.arrays.BooleanArray): dtype = np.bool_ fill_value = True elif isinstance(pd_array, pd.arrays.IntegerArray): dtype = np.int32 fill_value = R_INT_NA - elif isinstance(pd_array, pd.arrays.FloatingArray): # type: ignore [attr-defined] - dtype = np.float64 - fill_value = R_FLOAT_NA mask = pd_array.isna() if np.any(mask): - data = np.empty(pd_array.shape, dtype=dtype) - data[~mask] = pd_array[~mask].to_numpy() - data[mask] = fill_value - if isinstance(pd_array, pd.arrays.FloatingArray): # type: ignore [attr-defined] - array = data - else: - array = np.ma.array( # type: ignore [no-untyped-call] - data=data, - mask=mask, - fill_value=fill_value, - ) + data = pd_array.to_numpy(dtype=dtype, na_value=fill_value) + array = np.ma.array( # type: ignore [no-untyped-call] + data=data, + mask=mask, + fill_value=fill_value, + ) else: array = pd_array.to_numpy() assert array.dtype == dtype + assert isinstance(array, np.ndarray) # for mypy return array if isinstance(pd_array, ( - pd.arrays.NumpyExtensionArray, + pd.arrays.FloatingArray, # type: ignore [attr-defined] )): - return pd_array.to_numpy() + # Note that this possibly maps all NaNs (not only R_FLOAT_NA) + # to the same `na_value` depending on how the array was built: + array = pd_array.to_numpy(dtype=np.float64, na_value=R_FLOAT_NA) + assert isinstance(array, np.ndarray) # for mypy + return array + + if isinstance(pd_array, ( + pd.arrays.NumpyExtensionArray, # type: ignore [attr-defined] + )): + array = pd_array.to_numpy() + assert isinstance(array, np.ndarray) # for mypy + return array msg = f"pandas array {type(array)} not implemented" raise NotImplementedError(msg) diff --git a/rdata/tests/test_rdata.py b/rdata/tests/test_rdata.py index 1a0f982..d3c0a89 100644 --- a/rdata/tests/test_rdata.py +++ b/rdata/tests/test_rdata.py @@ -519,7 +519,7 @@ def test_dataframe_int_rownames(self) -> None: { "col1": pd.Series( [10., 20., 30.], - dtype=pd.Float64Dtype(), index=index), + dtype=float, index=index), }, index=index, ) @@ -536,7 +536,7 @@ def test_dataframe_range_rownames(self) -> None: { "col1": pd.Series( [10., 20., 30.], - dtype=pd.Float64Dtype(), index=index), + dtype=float, index=index), }, index=index, ) @@ -556,7 +556,7 @@ def test_dataframe_dtypes(self) -> None: dtype=pd.Int32Dtype(), index=index), "float": pd.Series( [1.1, 2.2, 3.3], - dtype=pd.Float64Dtype(), index=index), + dtype=float, index=index), "string": pd.Series( ["x" ,"y", "z"], dtype=pd.StringDtype(), index=index), @@ -584,8 +584,8 @@ def test_dataframe_dtypes_with_na(self) -> None: [10, 20, 30, pd.NA], dtype=pd.Int32Dtype(), index=index), "float": pd.Series( - [1.1, 2.2, 3.3, pd.NA], - dtype=pd.Float64Dtype(), index=index), + [1.1, 2.2, 3.3, rdata.parser.R_FLOAT_NA], + dtype=float, index=index), "string": pd.Series( ["x" ,"y", "z", pd.NA], dtype=pd.StringDtype(), index=index), From e88419e636149c80b7b94aab76f317143df02890 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 16 Sep 2024 10:33:09 +0300 Subject: [PATCH 051/100] Remove unused R_INT_MIN --- rdata/conversion/_conversion.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/rdata/conversion/_conversion.py b/rdata/conversion/_conversion.py index 8e492e3..57644c6 100644 --- a/rdata/conversion/_conversion.py +++ b/rdata/conversion/_conversion.py @@ -439,9 +439,6 @@ def convert_altrep_to_range( return range(start, stop + 1, step) -R_INT_MIN = -2**31 - - def _dataframe_column_transform(source: Any) -> Any: # noqa: ANN401 if isinstance(source, np.ndarray): From 6bb5d5bc9d0a77fc7cb26799e7ad4c9aafa55fc3 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 16 Sep 2024 10:35:19 +0300 Subject: [PATCH 052/100] Change dataframe default attribute order --- rdata/conversion/to_r.py | 2 +- rdata/tests/test_write.py | 12 +++++------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index f09ba17..f232316 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -513,8 +513,8 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 attributes = { "names": np.array(column_names, dtype=np.dtype("U")), - "row.names": row_names, "class": "data.frame", + "row.names": row_names, } if self.df_attr_order is not None: attributes = {k: attributes[k] for k in self.df_attr_order} diff --git a/rdata/tests/test_write.py b/rdata/tests/test_write.py index 3e9b5ec..edef9e7 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -139,14 +139,12 @@ def test_convert_to_r(fname: str, expand_altrep: bool) -> None: # noqa: FBT001 r_version_serialized=r_data.versions.serialized, ) if fname in [ - "test_dataframe_dtypes.rds", - "test_dataframe_int_rownames.rds", - "test_dataframe_range_rownames.rds", - "test_dataframe_rownames.rda", - "test_dataframe_dtypes_with_na.rds", - "test_dataframe_float_with_na_nan.rds", + "test_dataframe.rda", + "test_dataframe.rds", + "test_dataframe_v3.rda", + "test_dataframe_v3.rds", ]: - converter.df_attr_order = ["names", "class", "row.names"] + converter.df_attr_order = ["names", "row.names", "class"] try: new_r_data = converter.convert_to_r_data(py_data, file_type=file_type) From fd813e2224886086364c989c1de47e6cd60a0ad9 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 16 Sep 2024 10:42:11 +0300 Subject: [PATCH 053/100] Move NA values and related functions to a new file --- rdata/conversion/to_r.py | 3 +-- rdata/missing.py | 17 +++++++++++++++++ rdata/parser/__init__.py | 3 --- rdata/parser/_ascii.py | 4 +++- rdata/parser/_parser.py | 15 ++------------- rdata/tests/test_rdata.py | 7 ++++--- rdata/unparser/_ascii.py | 2 +- rdata/unparser/_xdr.py | 2 +- 8 files changed, 29 insertions(+), 24 deletions(-) create mode 100644 rdata/missing.py diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index f232316..a2add50 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -9,9 +9,8 @@ import numpy as np import pandas as pd +from rdata.missing import R_FLOAT_NA, R_INT_NA from rdata.parser import ( - R_FLOAT_NA, - R_INT_NA, CharFlags, RData, RExtraInfo, diff --git a/rdata/missing.py b/rdata/missing.py new file mode 100644 index 0000000..0a931b4 --- /dev/null +++ b/rdata/missing.py @@ -0,0 +1,17 @@ +"""Utilities for missing (NA) values in R.""" + +from typing import Final + +import numpy as np + +#: Value used to represent a missing integer in R. +R_INT_NA: Final[int] = -2**31 + +#: Value used to represent a missing float in R. +# This is a NaN with a particular payload, but it's not the same as np.nan. +R_FLOAT_NA: Final[float] = np.frombuffer(b"\x7f\xf0\x00\x00\x00\x00\x07\xa2", dtype=">f8").astype("=f8")[0] # noqa: E501 + + +def is_float_na(value: float) -> bool: + """Check if value is NA value.""" + return np.array(value).tobytes() == np.array(R_FLOAT_NA).tobytes() diff --git a/rdata/parser/__init__.py b/rdata/parser/__init__.py index 98375fe..48421e6 100644 --- a/rdata/parser/__init__.py +++ b/rdata/parser/__init__.py @@ -2,8 +2,6 @@ from ._parser import ( DEFAULT_ALTREP_MAP as DEFAULT_ALTREP_MAP, - R_FLOAT_NA as R_FLOAT_NA, - R_INT_NA as R_INT_NA, CharFlags as CharFlags, RData as RData, RExtraInfo as RExtraInfo, @@ -11,7 +9,6 @@ RObjectInfo as RObjectInfo, RObjectType as RObjectType, RVersions as RVersions, - is_float_na as is_float_na, parse_data as parse_data, parse_file as parse_file, ) diff --git a/rdata/parser/_ascii.py b/rdata/parser/_ascii.py index 976d4df..f37c9e5 100644 --- a/rdata/parser/_ascii.py +++ b/rdata/parser/_ascii.py @@ -6,7 +6,9 @@ import numpy as np import numpy.typing as npt -from ._parser import R_FLOAT_NA, R_INT_NA, AltRepConstructorMap, Parser +from rdata.missing import R_FLOAT_NA, R_INT_NA + +from ._parser import AltRepConstructorMap, Parser def map_int_na(line: str) -> int: diff --git a/rdata/parser/_parser.py b/rdata/parser/_parser.py index aaa0123..b5e1570 100644 --- a/rdata/parser/_parser.py +++ b/rdata/parser/_parser.py @@ -23,24 +23,13 @@ import numpy as np import numpy.typing as npt +from rdata.missing import R_INT_NA + if TYPE_CHECKING: from ._ascii import ParserASCII from ._xdr import ParserXDR -#: Value used to represent a missing integer in R. -R_INT_NA: Final[int] = -2**31 - -#: Value used to represent a missing float in R. -# This is a NaN with a particular payload, but it's not the same as np.nan. -R_FLOAT_NA: Final[float] = np.frombuffer(b"\x7f\xf0\x00\x00\x00\x00\x07\xa2", dtype=">f8").astype("=f8")[0] # noqa: E501 - - -def is_float_na(value: float) -> bool: - """Check if value is NA value.""" - return np.array(value).tobytes() == np.array(R_FLOAT_NA).tobytes() - - @runtime_checkable class BinaryFileLike(Protocol): """Protocol for binary files.""" diff --git a/rdata/tests/test_rdata.py b/rdata/tests/test_rdata.py index d3c0a89..0a6ff54 100644 --- a/rdata/tests/test_rdata.py +++ b/rdata/tests/test_rdata.py @@ -13,6 +13,7 @@ import xarray import rdata +from rdata.missing import R_FLOAT_NA TESTDATA_PATH = rdata.TESTDATA_PATH @@ -584,7 +585,7 @@ def test_dataframe_dtypes_with_na(self) -> None: [10, 20, 30, pd.NA], dtype=pd.Int32Dtype(), index=index), "float": pd.Series( - [1.1, 2.2, 3.3, rdata.parser.R_FLOAT_NA], + [1.1, 2.2, 3.3, R_FLOAT_NA], dtype=float, index=index), "string": pd.Series( ["x" ,"y", "z", pd.NA], @@ -593,7 +594,7 @@ def test_dataframe_dtypes_with_na(self) -> None: [True, False, True, pd.NA], dtype=pd.BooleanDtype(), index=index), "complex": pd.Series( - [4+5j, 6+7j, 8+9j, rdata.parser.R_FLOAT_NA], + [4+5j, 6+7j, 8+9j, R_FLOAT_NA], dtype=complex, index=index), }, index=index, @@ -613,7 +614,7 @@ def test_dataframe_float_with_na_nan(self) -> None: ref = pd.DataFrame( { "float": pd.Series( - [1.1, 2.2, 3.3, rdata.parser.R_FLOAT_NA, np.nan, np.inf, -np.inf], + [1.1, 2.2, 3.3, R_FLOAT_NA, np.nan, np.inf, -np.inf], dtype=float, index=index), }, index=index, diff --git a/rdata/unparser/_ascii.py b/rdata/unparser/_ascii.py index 4ea9863..2fbd376 100644 --- a/rdata/unparser/_ascii.py +++ b/rdata/unparser/_ascii.py @@ -7,7 +7,7 @@ import numpy as np -from rdata.parser import is_float_na +from rdata.missing import is_float_na from ._unparser import Unparser diff --git a/rdata/unparser/_xdr.py b/rdata/unparser/_xdr.py index 8bea3f0..e2031e2 100644 --- a/rdata/unparser/_xdr.py +++ b/rdata/unparser/_xdr.py @@ -6,7 +6,7 @@ import numpy as np -from rdata.parser import R_INT_NA +from rdata.missing import R_INT_NA from ._unparser import Unparser From c63dfe774060e99aa810c9f1bb81043839e99c03 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 16 Sep 2024 11:37:37 +0300 Subject: [PATCH 054/100] Add helper functions for handling NA values --- rdata/missing.py | 91 +++++++++++++++++++++++++++++++++++++--- rdata/parser/_parser.py | 14 +------ rdata/unparser/_ascii.py | 4 +- 3 files changed, 90 insertions(+), 19 deletions(-) diff --git a/rdata/missing.py b/rdata/missing.py index 0a931b4..8ce1e7b 100644 --- a/rdata/missing.py +++ b/rdata/missing.py @@ -1,17 +1,98 @@ """Utilities for missing (NA) values in R.""" -from typing import Final +from __future__ import annotations + +from typing import TYPE_CHECKING import numpy as np +if TYPE_CHECKING: + from typing import Any, Final + + import numpy.typing as npt + + #: Value used to represent a missing integer in R. -R_INT_NA: Final[int] = -2**31 +R_INT_NA: Final[int] = np.int32(-2**31) # type: ignore [assignment] #: Value used to represent a missing float in R. # This is a NaN with a particular payload, but it's not the same as np.nan. R_FLOAT_NA: Final[float] = np.frombuffer(b"\x7f\xf0\x00\x00\x00\x00\x07\xa2", dtype=">f8").astype("=f8")[0] # noqa: E501 -def is_float_na(value: float) -> bool: - """Check if value is NA value.""" - return np.array(value).tobytes() == np.array(R_FLOAT_NA).tobytes() +def get_na_value(dtype: np.dtype[Any]) -> Any: # noqa: ANN401 + """ + Get NA value for a given type. + + Args: + dtype: NumPy dtype. + + Returns: + NA value of given dtype. + """ + if dtype == np.int32: + return R_INT_NA + if dtype == np.float64: + return R_FLOAT_NA + msg = f"NA for numpy dtype {dtype} not implemented" + raise NotImplementedError(msg) + + +def is_na( + array: np.int32 | np.float64 | npt.NDArray[np.int32 | np.float64], +) -> bool | npt.NDArray[np.bool_]: + """ + Check if the array elements are NA. + + Args: + array: NumPy array or single value. + + Returns: + Boolean mask of NA values in the array. + """ + if isinstance(array, np.ndarray): + dtype = array.dtype + na = get_na_value(dtype) + if dtype == np.int32: + # Use the native dtype for comparison when possible; + # slightly faster than the steps below + return array == na # type: ignore [no-any-return] + raw_dtype = f"V{array.dtype.itemsize}" + return array.view(raw_dtype) == np.array(na).view(raw_dtype) # type: ignore [no-any-return] + + if isinstance(array, (np.int32, np.float64)): + return is_na(np.array(array)) + + msg = f"NA for {type(array)} not implemented" + raise NotImplementedError(msg) + + +def mask_na_values( + array: npt.NDArray[Any], + *, + fill_value: Any | None = None, # noqa: ANN401 +) -> npt.NDArray[Any] | np.ma.MaskedArray[Any, Any]: + """ + Mask NA elements of the array. + + Args: + array: NumPy array. + fill_value: Fill value for the masked array. + Defaults to the NA value. + + Returns: + NumPy masked array with NA values as the mask + or the original array if there is no NA elements. + """ + mask = is_na(array) + if np.any(mask): + if fill_value is None: + fill_value = get_na_value(array.dtype) + + array[mask] = fill_value + return np.ma.array( # type: ignore [no-untyped-call,no-any-return] + data=array, + mask=mask, + fill_value=fill_value, + ) + return array diff --git a/rdata/parser/_parser.py b/rdata/parser/_parser.py index b5e1570..82b080a 100644 --- a/rdata/parser/_parser.py +++ b/rdata/parser/_parser.py @@ -23,7 +23,7 @@ import numpy as np import numpy.typing as npt -from rdata.missing import R_INT_NA +from rdata.missing import R_INT_NA, mask_na_values if TYPE_CHECKING: from ._ascii import ParserASCII @@ -606,17 +606,7 @@ def parse_nullable_int_array( ) -> npt.NDArray[np.int32] | np.ma.MaskedArray[Any, Any]: """Parse an integer array.""" data = self._parse_array(np.int32) - mask = (data == R_INT_NA) - data[mask] = fill_value - - if np.any(mask): - return np.ma.array( # type: ignore [no-untyped-call,no-any-return] - data=data, - mask=mask, - fill_value=fill_value, - ) - - return data + return mask_na_values(data, fill_value=fill_value) def parse_double_array(self) -> npt.NDArray[np.float64]: """Parse a double array.""" diff --git a/rdata/unparser/_ascii.py b/rdata/unparser/_ascii.py index 2fbd376..36b83ae 100644 --- a/rdata/unparser/_ascii.py +++ b/rdata/unparser/_ascii.py @@ -7,7 +7,7 @@ import numpy as np -from rdata.missing import is_float_na +from rdata.missing import is_na from ._unparser import Unparser @@ -53,7 +53,7 @@ def _unparse_array_values(self, array: npt.NDArray[Any]) -> None: # noqa: C901 line = "NA" if value is None or np.ma.is_masked(value) else str(value) # type: ignore [no-untyped-call] elif np.issubdtype(array.dtype, np.floating): - if is_float_na(value): + if is_na(value): line = "NA" elif np.isnan(value): line = "NaN" From b8b6948f82e60a5b832a8c5d9c00b82b2251dc33 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 16 Sep 2024 13:01:43 +0300 Subject: [PATCH 055/100] Add comment on setting mask --- rdata/conversion/_conversion.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rdata/conversion/_conversion.py b/rdata/conversion/_conversion.py index 57644c6..bd80070 100644 --- a/rdata/conversion/_conversion.py +++ b/rdata/conversion/_conversion.py @@ -453,6 +453,9 @@ def _dataframe_column_transform(source: Any) -> Any: # noqa: ANN401 return source # This would create an array with all NaNs as "missing": # dtype = pd.Float64Dtype() # noqa: ERA001 + # This would create an array with only R_FLOAT_NA as "missing": + # from rdata.missing import is_na # noqa: ERA001 + # return pd.arrays.FloatingArray(source, is_na(source)) # noqa: ERA001 elif np.issubdtype(source.dtype, np.complexfloating): # There seems to be no pandas type for complex array return source From e773101a2bd6e9a429b226a42b465bef515f6400 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 16 Sep 2024 13:39:33 +0300 Subject: [PATCH 056/100] Add tests for missing value functionality --- rdata/tests/test_missing.py | 90 +++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 rdata/tests/test_missing.py diff --git a/rdata/tests/test_missing.py b/rdata/tests/test_missing.py new file mode 100644 index 0000000..ac3e22b --- /dev/null +++ b/rdata/tests/test_missing.py @@ -0,0 +1,90 @@ +"""Tests of missing value functionality.""" + +from __future__ import annotations + +from typing import Any + +import numpy as np +import pytest + +from rdata.missing import R_FLOAT_NA, R_INT_NA, is_na, mask_na_values + + +def test_int_is_na() -> None: + """Test checking NA values in int array.""" + array = np.array([1, 2, R_INT_NA], dtype=np.int32) + ref_mask = np.array([0, 0, 1], dtype=np.bool_) + + mask = is_na(array) + np.testing.assert_array_equal(mask, ref_mask) + + +def test_float_is_na() -> None: + """Test checking NA values in float array.""" + array = np.array([1, 2, R_FLOAT_NA, np.nan], dtype=np.float64) + ref_mask = np.array([0, 0, 1, 0], dtype=np.bool_) + + mask = is_na(array) + np.testing.assert_array_equal(mask, ref_mask) + + +@pytest.mark.parametrize("value", [R_INT_NA, R_FLOAT_NA]) +def test_value_is_na(value: Any) -> None: # noqa: ANN401 + """Test checking single NA values.""" + assert is_na(value) + + +@pytest.mark.parametrize("value", [ + np.int32(0), 0, np.float64(0.0), 0.0, np.nan, +]) +def test_value_is_not_na(value: Any) -> None: # noqa: ANN401 + """Test checking single NA values.""" + assert not is_na(value) + + +def test_int64() -> None: + """Test checking int64.""" + with pytest.raises(NotImplementedError): + is_na(2**32) + with pytest.raises(NotImplementedError): + is_na(-2**32) + + +def test_wrong_type() -> None: + """Test checking int64.""" + with pytest.raises(NotImplementedError): + is_na("test") + + +def test_masked_array() -> None: + """Test checking masked array creation.""" + array = np.array([1, 2, R_FLOAT_NA, np.nan], dtype=np.float64) + ref_mask = np.array([0, 0, 1, 0], dtype=np.bool_) + ref_data = array.copy() + + masked = mask_na_values(array) + assert isinstance(masked, np.ma.MaskedArray) + np.testing.assert_array_equal(masked.data, ref_data) + np.testing.assert_array_equal(masked.mask, ref_mask) + + +def test_masked_array_fill() -> None: + """Test checking masked array creation.""" + array = np.array([1, 2, R_FLOAT_NA, np.nan], dtype=np.float64) + ref_mask = np.array([0, 0, 1, 0], dtype=np.bool_) + ref_data = array.copy() + ref_data[ref_mask] = 42 + + masked = mask_na_values(array, fill_value=42) + assert isinstance(masked, np.ma.MaskedArray) + np.testing.assert_array_equal(masked.data, ref_data) + np.testing.assert_array_equal(masked.mask, ref_mask) + + +def test_nonmasked_array() -> None: + """Test checking masked array no-op.""" + array = np.array([1, 2, np.nan, np.nan], dtype=np.float64) + + masked = mask_na_values(array) + assert not isinstance(masked, np.ma.MaskedArray) + np.testing.assert_array_equal(masked, array) From 828f9daef4f7c5ff0aa94504c5581379409baf81 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 16 Sep 2024 13:39:51 +0300 Subject: [PATCH 057/100] Include checking int and float values --- rdata/missing.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/rdata/missing.py b/rdata/missing.py index 8ce1e7b..8e6702f 100644 --- a/rdata/missing.py +++ b/rdata/missing.py @@ -39,7 +39,7 @@ def get_na_value(dtype: np.dtype[Any]) -> Any: # noqa: ANN401 def is_na( - array: np.int32 | np.float64 | npt.NDArray[np.int32 | np.float64], + array: Any | npt.NDArray[Any], # noqa: ANN401 ) -> bool | npt.NDArray[np.bool_]: """ Check if the array elements are NA. @@ -60,7 +60,13 @@ def is_na( raw_dtype = f"V{array.dtype.itemsize}" return array.view(raw_dtype) == np.array(na).view(raw_dtype) # type: ignore [no-any-return] - if isinstance(array, (np.int32, np.float64)): + if isinstance(array, int): + try: + return is_na(np.array(array, dtype=np.int32)) + except OverflowError: + return is_na(np.array(array)) + + if isinstance(array, (float, np.int32, np.float64)): return is_na(np.array(array)) msg = f"NA for {type(array)} not implemented" From 0fe903f2d1208a687d379ecae6185bd859783e03 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 16 Sep 2024 14:29:12 +0300 Subject: [PATCH 058/100] Include ascii format in testing too large ints --- rdata/tests/test_write.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rdata/tests/test_write.py b/rdata/tests/test_write.py index edef9e7..7e92312 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -206,13 +206,14 @@ def test_convert_to_r_unsupported_encoding() -> None: converter.convert_to_r_object("ä") -def test_unparse_big_int() -> None: +@pytest.mark.parametrize("file_format", valid_formats) +def test_unparse_big_int(file_format: FileFormat) -> None: """Test checking too large integers.""" big_int = 2**32 converter = ConverterFromPythonToR() r_data = converter.convert_to_r_data(big_int) with pytest.raises(ValueError, match="(?i)not castable"): - unparse_data(r_data, file_format="xdr") + unparse_data(r_data, file_format=file_format) def test_convert_dataframe_pandas_dtypes() -> None: From 2755b3c39d393b5fef6e8943822eae1293fe6f8b Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 16 Sep 2024 14:30:38 +0300 Subject: [PATCH 059/100] Fix datatype conversions in ascii unparser --- rdata/unparser/_ascii.py | 12 +++++------- rdata/unparser/_unparser.py | 26 +++++++++++++++++++++++++- rdata/unparser/_xdr.py | 27 +++++---------------------- 3 files changed, 35 insertions(+), 30 deletions(-) diff --git a/rdata/unparser/_ascii.py b/rdata/unparser/_ascii.py index 36b83ae..3219d5b 100644 --- a/rdata/unparser/_ascii.py +++ b/rdata/unparser/_ascii.py @@ -3,7 +3,7 @@ from __future__ import annotations import string -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING import numpy as np @@ -37,11 +37,9 @@ def unparse_magic(self) -> None: """Unparse magic bits.""" self._add_line("A") - def _unparse_array_values(self, array: npt.NDArray[Any]) -> None: # noqa: C901 - # Convert boolean to int - if np.issubdtype(array.dtype, np.bool_): - array = array.astype(np.int32) - + def _unparse_array_values_raw(self, + array: npt.NDArray[np.int32 | np.float64 | np.complex128], + ) -> None: # Convert complex to pairs of floats if np.issubdtype(array.dtype, np.complexfloating): assert array.dtype == np.complex128 @@ -50,7 +48,7 @@ def _unparse_array_values(self, array: npt.NDArray[Any]) -> None: # noqa: C901 # Unparse data for value in array: if np.issubdtype(array.dtype, np.integer): - line = "NA" if value is None or np.ma.is_masked(value) else str(value) # type: ignore [no-untyped-call] + line = "NA" if is_na(value) else str(value) elif np.issubdtype(array.dtype, np.floating): if is_na(value): diff --git a/rdata/unparser/_unparser.py b/rdata/unparser/_unparser.py index 7dd6243..0b57705 100644 --- a/rdata/unparser/_unparser.py +++ b/rdata/unparser/_unparser.py @@ -7,6 +7,7 @@ import numpy as np +from rdata.missing import R_INT_NA from rdata.parser import ( RData, RExtraInfo, @@ -69,9 +70,32 @@ def unparse_array(self, array: npt.NDArray[Any]) -> None: self.unparse_int(array.size) self._unparse_array_values(array) - @abc.abstractmethod def _unparse_array_values(self, array: npt.NDArray[Any]) -> None: """Unparse the values of an array.""" + # Convert boolean to int + if np.issubdtype(array.dtype, np.bool_): + array = array.astype(np.int32) + + # Flatten masked values and convert int arrays to int32 + if np.issubdtype(array.dtype, np.integer): + if np.ma.is_masked(array): # type: ignore [no-untyped-call] + mask = np.ma.getmask(array) # type: ignore [no-untyped-call] + array = np.ma.getdata(array).copy() # type: ignore [no-untyped-call] + array[mask] = R_INT_NA + info = np.iinfo(np.int32) + if not all(info.min <= val <= info.max for val in array): + msg = "Integer array not castable to int32" + raise ValueError(msg) + array = array.astype(np.int32) + + assert array.dtype in (np.int32, np.float64, np.complex128) + self._unparse_array_values_raw(array) + + @abc.abstractmethod + def _unparse_array_values_raw(self, + array: npt.NDArray[np.int32 | np.float64 | np.complex128], + ) -> None: + """Unparse the values of an array as such.""" @abc.abstractmethod def unparse_string(self, value: bytes) -> None: diff --git a/rdata/unparser/_xdr.py b/rdata/unparser/_xdr.py index e2031e2..664d95b 100644 --- a/rdata/unparser/_xdr.py +++ b/rdata/unparser/_xdr.py @@ -2,17 +2,14 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any - -import numpy as np - -from rdata.missing import R_INT_NA +from typing import TYPE_CHECKING from ._unparser import Unparser if TYPE_CHECKING: import io + import numpy as np import numpy.typing as npt @@ -30,23 +27,9 @@ def unparse_magic(self) -> None: """Unparse magic bits.""" self.file.write(b"X\n") - def _unparse_array_values(self, array: npt.NDArray[Any]) -> None: - # Convert boolean to int - if np.issubdtype(array.dtype, np.bool_): - array = array.astype(np.int32) - - # Flatten masked values and convert int arrays to int32 - if np.issubdtype(array.dtype, np.integer): - if np.ma.is_masked(array): # type: ignore [no-untyped-call] - mask = np.ma.getmask(array) # type: ignore [no-untyped-call] - array = np.ma.getdata(array).copy() # type: ignore [no-untyped-call] - array[mask] = R_INT_NA - info = np.iinfo(np.int32) - if not all(info.min <= val <= info.max for val in array): - msg = "Integer array not castable to int32" - raise ValueError(msg) - array = array.astype(np.int32) - + def _unparse_array_values_raw(self, + array: npt.NDArray[np.int32 | np.float64 | np.complex128], + ) -> None: # Convert to big endian if needed array = array.astype(array.dtype.newbyteorder(">")) From 6040baeb30d791332a4421021de00db07bb546e7 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 16 Sep 2024 14:42:10 +0300 Subject: [PATCH 060/100] Include testing negative end --- rdata/tests/test_write.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rdata/tests/test_write.py b/rdata/tests/test_write.py index 7e92312..3d6a9e3 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -207,11 +207,11 @@ def test_convert_to_r_unsupported_encoding() -> None: @pytest.mark.parametrize("file_format", valid_formats) -def test_unparse_big_int(file_format: FileFormat) -> None: +@pytest.mark.parametrize("value", [-2**31 - 1, 2**31]) +def test_unparse_big_int(file_format: FileFormat, value: int) -> None: """Test checking too large integers.""" - big_int = 2**32 converter = ConverterFromPythonToR() - r_data = converter.convert_to_r_data(big_int) + r_data = converter.convert_to_r_data(value) with pytest.raises(ValueError, match="(?i)not castable"): unparse_data(r_data, file_format=file_format) From 7c605661b77fbd48777a06dbd3ef415faffbd8f1 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 16 Sep 2024 14:42:41 +0300 Subject: [PATCH 061/100] Speed up range check --- rdata/unparser/_unparser.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/rdata/unparser/_unparser.py b/rdata/unparser/_unparser.py index 0b57705..b41647e 100644 --- a/rdata/unparser/_unparser.py +++ b/rdata/unparser/_unparser.py @@ -82,11 +82,13 @@ def _unparse_array_values(self, array: npt.NDArray[Any]) -> None: mask = np.ma.getmask(array) # type: ignore [no-untyped-call] array = np.ma.getdata(array).copy() # type: ignore [no-untyped-call] array[mask] = R_INT_NA - info = np.iinfo(np.int32) - if not all(info.min <= val <= info.max for val in array): - msg = "Integer array not castable to int32" - raise ValueError(msg) - array = array.astype(np.int32) + + if array.dtype != np.int32: + info = np.iinfo(np.int32) + if np.any((array < info.min) | (array > info.max)): + msg = "Integer array not castable to int32" + raise ValueError(msg) + array = array.astype(np.int32) assert array.dtype in (np.int32, np.float64, np.complex128) self._unparse_array_values_raw(array) From d450f7306cd13f2159b70ce10e460ab45c58d76c Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 16 Sep 2024 15:38:32 +0300 Subject: [PATCH 062/100] Move duplicated code to a function --- rdata/conversion/_conversion.py | 14 +++----------- rdata/parser/_parser.py | 29 +++++++++++++++++------------ 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/rdata/conversion/_conversion.py b/rdata/conversion/_conversion.py index bd80070..eb1dc5b 100644 --- a/rdata/conversion/_conversion.py +++ b/rdata/conversion/_conversion.py @@ -14,6 +14,8 @@ import xarray from typing_extensions import override +from rdata.parser._parser import get_altrep_name + from .. import parser ConversionFunction = Callable[[Union[parser.RData, parser.RObject]], Any] @@ -416,17 +418,7 @@ def convert_altrep_to_range( info, state, attr = r_altrep.value assert attr.info.type == parser.RObjectType.NILVALUE - assert info.info.type == parser.RObjectType.LIST - - class_sym = info.value[0] - while class_sym.info.type == parser.RObjectType.REF: - class_sym = class_sym.referenced_object - - assert class_sym.info.type == parser.RObjectType.SYM - assert class_sym.value.info.type == parser.RObjectType.CHAR - - altrep_name = class_sym.value.value - assert isinstance(altrep_name, bytes) + altrep_name = get_altrep_name(info) if altrep_name != b"compact_intseq": msg = "Only compact integer sequences can be converted to range" diff --git a/rdata/parser/_parser.py b/rdata/parser/_parser.py index 82b080a..44661bc 100644 --- a/rdata/parser/_parser.py +++ b/rdata/parser/_parser.py @@ -538,6 +538,22 @@ def wrap_constructor( return new_info, value +def get_altrep_name(info: RObject) -> bytes: + """Get the name of the ALTREP object.""" + assert info.info.type == RObjectType.LIST + + class_sym = info.value[0] + while class_sym.info.type == RObjectType.REF: + class_sym = class_sym.referenced_object + + assert class_sym.info.type == RObjectType.SYM + assert class_sym.value.info.type == RObjectType.CHAR + + altrep_name = class_sym.value.value + assert isinstance(altrep_name, bytes) + return altrep_name + + default_altrep_map_dict: Final[Mapping[bytes, AltRepConstructor]] = { b"deferred_string": deferred_string_constructor, b"compact_intseq": compact_intseq_constructor, @@ -666,18 +682,7 @@ def expand_altrep_to_object( state: RObject, ) -> tuple[RObjectInfo, Any]: """Expand alternative representation to normal object.""" - assert info.info.type == RObjectType.LIST - - class_sym = info.value[0] - while class_sym.info.type == RObjectType.REF: - class_sym = class_sym.referenced_object - - assert class_sym.info.type == RObjectType.SYM - assert class_sym.value.info.type == RObjectType.CHAR - - altrep_name = class_sym.value.value - assert isinstance(altrep_name, bytes) - + altrep_name = get_altrep_name(info) constructor = self.altrep_constructor_dict[altrep_name] return constructor(state) From 4d3e38f323d49451735f458a91883c059ea89e62 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 16 Sep 2024 17:25:41 +0300 Subject: [PATCH 063/100] Speed up range check --- rdata/unparser/_unparser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdata/unparser/_unparser.py b/rdata/unparser/_unparser.py index b41647e..ae15914 100644 --- a/rdata/unparser/_unparser.py +++ b/rdata/unparser/_unparser.py @@ -85,7 +85,7 @@ def _unparse_array_values(self, array: npt.NDArray[Any]) -> None: if array.dtype != np.int32: info = np.iinfo(np.int32) - if np.any((array < info.min) | (array > info.max)): + if np.any(array > info.max) or np.any(array < info.min): msg = "Integer array not castable to int32" raise ValueError(msg) array = array.astype(np.int32) From ac948de64e23a45606952ab5ccf03e383d7f70ad Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 16 Sep 2024 17:52:38 +0300 Subject: [PATCH 064/100] Fix docstring --- rdata/conversion/_conversion.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/rdata/conversion/_conversion.py b/rdata/conversion/_conversion.py index eb1dc5b..8530b8a 100644 --- a/rdata/conversion/_conversion.py +++ b/rdata/conversion/_conversion.py @@ -406,10 +406,7 @@ def convert_altrep_to_range( r_altrep: R altrep object Returns: - Array. - - See Also: - convert_array + Range object. """ if r_altrep.info.type != parser.RObjectType.ALTREP: msg = "Must receive an altrep object" From 957887c60a3f322e523d6ae2ac9916f147f57a38 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Thu, 19 Sep 2024 09:48:31 +0300 Subject: [PATCH 065/100] Simplify the definition of the NA value --- rdata/missing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdata/missing.py b/rdata/missing.py index 8e6702f..b60a6f8 100644 --- a/rdata/missing.py +++ b/rdata/missing.py @@ -17,7 +17,7 @@ #: Value used to represent a missing float in R. # This is a NaN with a particular payload, but it's not the same as np.nan. -R_FLOAT_NA: Final[float] = np.frombuffer(b"\x7f\xf0\x00\x00\x00\x00\x07\xa2", dtype=">f8").astype("=f8")[0] # noqa: E501 +R_FLOAT_NA: Final[float] = np.uint64(0x7ff00000000007a2).view(np.float64) # type: ignore [assignment] def get_na_value(dtype: np.dtype[Any]) -> Any: # noqa: ANN401 From c3fe17c66e407de9db8f3e31b1acf9daacdd4b28 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi <34502776+trossi@users.noreply.github.com> Date: Wed, 2 Oct 2024 09:57:27 +0300 Subject: [PATCH 066/100] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Ramos Carreño --- rdata/conversion/to_r.py | 14 +++++++++----- rdata/tests/test_write.py | 2 +- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index a2add50..8415504 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -50,7 +50,7 @@ def convert_pd_array_to_np_array( - pd_array: Any, # noqa: ANN401 + pd_array: pd.api.extensions.ExtensionArray, ) -> npt.NDArray[Any]: """ Convert pandas array object to numpy array. @@ -527,10 +527,14 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 else: r_attributes = None - return build_r_object(r_type, value=r_value, - is_object=is_object, - attributes=r_attributes, - tag=tag, gp=gp) + return build_r_object( + r_type, + value=r_value, + is_object=is_object, + attributes=r_attributes, + tag=tag, + gp=gp, + ) def convert_python_to_r_data( diff --git a/rdata/tests/test_write.py b/rdata/tests/test_write.py index 3d6a9e3..8620d3a 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -154,7 +154,7 @@ def test_convert_to_r(fname: str, expand_altrep: bool) -> None: # noqa: FBT001 assert str(r_data) == str(new_r_data) assert r_data == new_r_data - # Check futher that the resulting unparsed data is correct to ensure that + # Check further that the resulting unparsed data is correct to ensure that # Python-to-R conversion hasn't created any odd objects that can't be unparsed if not expand_altrep: file_type, file_format = parse_file_type_and_format(data) From 74a11ba76c834d8db20d3686a17e762dce1ffd84 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Wed, 2 Oct 2024 10:01:52 +0300 Subject: [PATCH 067/100] Fix indentation --- rdata/conversion/to_r.py | 78 ++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 8415504..da7427d 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -112,14 +112,14 @@ def convert_pd_array_to_np_array( def build_r_object( - r_type: RObjectType, - *, - value: Any = None, # noqa: ANN401 - is_object: bool = False, - attributes: RObject | None = None, - tag: RObject | None = None, - gp: int = 0, - reference: tuple[int, RObject | None] = (0, None), + r_type: RObjectType, + *, + value: Any = None, # noqa: ANN401 + is_object: bool = False, + attributes: RObject | None = None, + tag: RObject | None = None, + gp: int = 0, + reference: tuple[int, RObject | None] = (0, None), ) -> RObject: """ Build R object. @@ -163,7 +163,7 @@ def build_r_object( def build_r_list( - data: list[RObject] | list[tuple[RObject, RObject]], + data: list[RObject] | list[tuple[RObject, RObject]], ) -> RObject: """ Build R object representing (named) linked list. @@ -201,9 +201,9 @@ class ConverterFromPythonToR: r_version_serialized: R version written as the creator of the object. """ def __init__(self, *, - encoding: Encoding = "utf-8", - format_version: int = DEFAULT_FORMAT_VERSION, - r_version_serialized: int = DEFAULT_R_VERSION_SERIALIZED, + encoding: Encoding = "utf-8", + format_version: int = DEFAULT_FORMAT_VERSION, + r_version_serialized: int = DEFAULT_R_VERSION_SERIALIZED, ) -> None: """ Init class. @@ -226,9 +226,9 @@ def __init__(self, *, def convert_to_r_data(self, - data: Any, # noqa: ANN401 - *, - file_type: FileType = "rds", + data: Any, # noqa: ANN401 + *, + file_type: FileType = "rds", ) -> RData: """ Convert Python data to R data. @@ -265,7 +265,7 @@ def convert_to_r_data(self, def convert_to_r_attributes(self, - data: dict[str, Any], + data: dict[str, Any], ) -> RObject: """ Convert dictionary to R attributes list. @@ -287,7 +287,7 @@ def convert_to_r_attributes(self, def convert_to_r_sym(self, - name: str, + name: str, ) -> RObject: """ Convert string to R symbol. @@ -313,7 +313,7 @@ def convert_to_r_sym(self, def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 - data: Any, # noqa: ANN401 + data: Any, # noqa: ANN401 ) -> RObject: """ Convert Python data to R object. @@ -493,10 +493,10 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 and index.step == 1 ): row_names = np.ma.array( - data=[R_INT_NA, -data.shape[0]], - mask=[True, False], - fill_value=R_INT_NA, - ) + data=[R_INT_NA, -data.shape[0]], + mask=[True, False], + fill_value=R_INT_NA, + ) else: row_names = range(index.start, index.stop, index.step) elif isinstance(index, pd.Index): @@ -538,12 +538,12 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 def convert_python_to_r_data( - data: Any, # noqa: ANN401 - *, - encoding: Encoding = "utf-8", - format_version: int = DEFAULT_FORMAT_VERSION, - r_version_serialized: int = DEFAULT_R_VERSION_SERIALIZED, - file_type: FileType = "rds", + data: Any, # noqa: ANN401 + *, + encoding: Encoding = "utf-8", + format_version: int = DEFAULT_FORMAT_VERSION, + r_version_serialized: int = DEFAULT_R_VERSION_SERIALIZED, + file_type: FileType = "rds", ) -> RData: """ Convert Python data to R data. @@ -562,18 +562,18 @@ def convert_python_to_r_data( convert_python_to_r_object """ return ConverterFromPythonToR( - encoding=encoding, - format_version=format_version, - r_version_serialized=r_version_serialized, + encoding=encoding, + format_version=format_version, + r_version_serialized=r_version_serialized, ).convert_to_r_data(data, file_type=file_type) def convert_python_to_r_object( - data: Any, # noqa: ANN401 - *, - encoding: Encoding = "utf-8", - format_version: int = DEFAULT_FORMAT_VERSION, - r_version_serialized: int = DEFAULT_R_VERSION_SERIALIZED, + data: Any, # noqa: ANN401 + *, + encoding: Encoding = "utf-8", + format_version: int = DEFAULT_FORMAT_VERSION, + r_version_serialized: int = DEFAULT_R_VERSION_SERIALIZED, ) -> RObject: """ Convert Python data to R object. @@ -591,7 +591,7 @@ def convert_python_to_r_object( convert_python_to_r_data """ return ConverterFromPythonToR( - encoding=encoding, - format_version=format_version, - r_version_serialized=r_version_serialized, + encoding=encoding, + format_version=format_version, + r_version_serialized=r_version_serialized, ).convert_to_r_object(data) From 5449199e291928a02cbd397a92edd6548b7293a6 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Wed, 2 Oct 2024 10:02:53 +0300 Subject: [PATCH 068/100] Fix mypy --- rdata/conversion/to_r.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index da7427d..3b849bd 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -77,7 +77,7 @@ def convert_pd_array_to_np_array( dtype = np.int32 fill_value = R_INT_NA - mask = pd_array.isna() + mask = pd_array.isna() # type: ignore [no-untyped-call] if np.any(mask): data = pd_array.to_numpy(dtype=dtype, na_value=fill_value) array = np.ma.array( # type: ignore [no-untyped-call] From 74bf9d8f60a80a3c1b7de1c32cc16cb22981d65a Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Wed, 2 Oct 2024 10:03:09 +0300 Subject: [PATCH 069/100] Comment code --- rdata/missing.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/rdata/missing.py b/rdata/missing.py index b60a6f8..2cad764 100644 --- a/rdata/missing.py +++ b/rdata/missing.py @@ -57,13 +57,18 @@ def is_na( # Use the native dtype for comparison when possible; # slightly faster than the steps below return array == na # type: ignore [no-any-return] - raw_dtype = f"V{array.dtype.itemsize}" + # Convert dtype to unsigned integer to perform byte-by-byte + # equality comparison to distinguish different NaN values + raw_dtype = f"u{array.dtype.itemsize}" return array.view(raw_dtype) == np.array(na).view(raw_dtype) # type: ignore [no-any-return] if isinstance(array, int): try: + # Python built-in integer is 64 bits or larger, so + # we try to cast it to 32-bit int if possible return is_na(np.array(array, dtype=np.int32)) except OverflowError: + # Proceed with larger integer (in case it is supported at some point) return is_na(np.array(array)) if isinstance(array, (float, np.int32, np.float64)): From da9c940b54fe363c5402f1c148243f2e0133579b Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Wed, 2 Oct 2024 10:05:36 +0300 Subject: [PATCH 070/100] Raise NotImplementedError for untested code --- rdata/parser/_parser.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/rdata/parser/_parser.py b/rdata/parser/_parser.py index 44661bc..8e7db16 100644 --- a/rdata/parser/_parser.py +++ b/rdata/parser/_parser.py @@ -935,9 +935,8 @@ def parse_R_object( # noqa: N802, C901, PLR0912, PLR0915 state=altrep_state, ) if altrep_attr.info.type != RObjectType.NILVALUE: - info.attributes = True - attributes_read = True - attributes = altrep_attr + msg = "altrep attributes not implemented" + raise NotImplementedError(msg) else: value = (altrep_info, altrep_state, altrep_attr) From 3acc18067b3ab2885acfcf5e11f17b20b58c6813 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Wed, 2 Oct 2024 10:39:01 +0300 Subject: [PATCH 071/100] Separate pandas types to constructor functions --- rdata/conversion/to_r.py | 199 ++++++++++++++++++++++++++------------- 1 file changed, 133 insertions(+), 66 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 3b849bd..f3ed1f5 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -27,7 +27,7 @@ if TYPE_CHECKING: from collections.abc import Mapping - from typing import Any, Final, Literal + from typing import Any, Final, Literal, Protocol import numpy.typing as npt @@ -35,6 +35,24 @@ Encoding = Literal["utf-8", "cp1252"] + class ConversionFunction(Protocol): + """Protocol for Py-to-R conversion function.""" + + def __call__(self, data: Any) -> RObject: # noqa: ANN401 + """Convert Python object to R object.""" + + + class ConstructorFunction(Protocol): + """Protocol for Py-to-R constructor function.""" + + def __call__(self, + data: Any, # noqa: ANN401 + convert_to_r_object: ConversionFunction, + ) -> tuple[RObjectType, Any, dict[str, Any]]: + """Convert Python object to R object components.""" + + ConstructorDict = Mapping[type, ConstructorFunction] + # Default values for RVersions object DEFAULT_FORMAT_VERSION: Final[int] = 3 @@ -49,6 +67,100 @@ R_MINIMUM_VERSION_WITH_ALTREP: Final[int] = 3 +def categorical_constructor( + data: pd.Categorical, + convert_to_r_object: ConversionFunction, # noqa: ARG001 +) -> tuple[RObjectType, Any, dict[str, Any]]: + """ + Construct R object components from pandas categorical. + + Args: + data: Pandas categorical. + convert_to_r_object: Conversion function. + + Returns: + Components of the R object. + """ + assert isinstance(data, pd.Categorical) + r_type = RObjectType.INT + r_value = data.codes + 1 + attributes = { + "levels": data.categories.to_numpy(), + "class": "factor", + } + return r_type, r_value, attributes + + +def dataframe_constructor( + data: pd.DataFrame, + convert_to_r_object: ConversionFunction, +) -> tuple[RObjectType, Any, dict[str, Any]]: + """ + Construct R object components from pandas dataframe. + + Args: + data: Pandas dataframe. + convert_to_r_object: Conversion function. + + Returns: + Components of the R object. + """ + assert isinstance(data, pd.DataFrame) + r_type = RObjectType.VEC + column_names = [] + r_value = [] + for column, series in data.items(): + assert isinstance(column, str) + column_names.append(column) + + pd_array = series.array + array: pd.Categorical | npt.NDArray[Any] + if isinstance(pd_array, pd.Categorical): + array = pd_array + else: + array = convert_pd_array_to_np_array(pd_array) + r_series = convert_to_r_object(array) + r_value.append(r_series) + + index = data.index + if isinstance(index, pd.RangeIndex): + assert isinstance(index.start, int) + if (index.start == 1 + and index.stop == data.shape[0] + 1 + and index.step == 1 + ): + row_names = np.ma.array( + data=[R_INT_NA, -data.shape[0]], + mask=[True, False], + fill_value=R_INT_NA, + ) + else: + row_names = range(index.start, index.stop, index.step) + elif isinstance(index, pd.Index): + if (index.dtype == "object" + or np.issubdtype(str(index.dtype), np.integer)): + row_names = index.to_numpy() + else: + msg = f"pd.DataFrame pd.Index {index.dtype} not implemented" + raise NotImplementedError(msg) + else: + msg = f"pd.DataFrame index {type(index)} not implemented" + raise NotImplementedError(msg) + + attributes = { + "names": np.array(column_names, dtype=np.dtype("U")), + "class": "data.frame", + "row.names": row_names, + } + return r_type, r_value, attributes + + +DEFAULT_CONSTRUCTOR_DICT: Final[ConstructorDict] = MappingProxyType({ + pd.Categorical: categorical_constructor, + pd.DataFrame: dataframe_constructor, +}) + + def convert_pd_array_to_np_array( pd_array: pd.api.extensions.ExtensionArray, ) -> npt.NDArray[Any]: @@ -196,11 +308,14 @@ class ConverterFromPythonToR: Class converting Python objects to R objects. Attributes: + constructor_dict: Dictionary mapping Python types to R classes. encoding: Encoding to be used for strings within data. format_version: File format version. r_version_serialized: R version written as the creator of the object. """ - def __init__(self, *, + def __init__(self, + constructor_dict: ConstructorDict = DEFAULT_CONSTRUCTOR_DICT, + *, encoding: Encoding = "utf-8", format_version: int = DEFAULT_FORMAT_VERSION, r_version_serialized: int = DEFAULT_R_VERSION_SERIALIZED, @@ -209,10 +324,12 @@ def __init__(self, *, Init class. Args: + constructor_dict: Dictionary mapping Python types to R classes. encoding: Encoding to be used for strings within data. format_version: File format version. r_version_serialized: R version written as the creator of the object. """ + self.constructor_dict = constructor_dict self.encoding = encoding self.format_version = format_version self.r_version_serialized = r_version_serialized @@ -454,75 +571,25 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 self.convert_to_r_object(None), ) - elif isinstance(data, pd.Series): - msg = "pd.Series not implemented" - raise NotImplementedError(msg) - - elif isinstance(data, pd.Categorical): - is_object = True - r_type = RObjectType.INT - r_value = data.codes + 1 - attributes = { - "levels": data.categories.to_numpy(), - "class": "factor", - } - - elif isinstance(data, pd.DataFrame): - is_object = True - r_type = RObjectType.VEC - column_names = [] - r_value = [] - for column, series in data.items(): - assert isinstance(column, str) - column_names.append(column) - - pd_array = series.array - array: pd.Categorical | npt.NDArray[Any] - if isinstance(pd_array, pd.Categorical): - array = pd_array - else: - array = convert_pd_array_to_np_array(pd_array) - r_series = self.convert_to_r_object(array) - r_value.append(r_series) - - index = data.index - if isinstance(index, pd.RangeIndex): - assert isinstance(index.start, int) - if (index.start == 1 - and index.stop == data.shape[0] + 1 - and index.step == 1 - ): - row_names = np.ma.array( - data=[R_INT_NA, -data.shape[0]], - mask=[True, False], - fill_value=R_INT_NA, - ) - else: - row_names = range(index.start, index.stop, index.step) - elif isinstance(index, pd.Index): - if (index.dtype == "object" - or np.issubdtype(str(index.dtype), np.integer)): - row_names = index.to_numpy() - else: - msg = f"pd.DataFrame pd.Index {index.dtype} not implemented" - raise NotImplementedError(msg) - else: - msg = f"pd.DataFrame index {type(index)} not implemented" + else: + # Check available constructors + for t, constructor in self.constructor_dict.items(): + if isinstance(data, t): + r_type, r_value, attributes \ + = constructor(data, self.convert_to_r_object) + break + + if r_type is None: + msg = f"type {type(data)} not implemented" raise NotImplementedError(msg) - attributes = { - "names": np.array(column_names, dtype=np.dtype("U")), - "class": "data.frame", - "row.names": row_names, - } - if self.df_attr_order is not None: + # Fix for test files where dataframe attribute order varies + assert isinstance(attributes, dict) + if isinstance(data, pd.DataFrame) and self.df_attr_order is not None: attributes = {k: attributes[k] for k in self.df_attr_order} - else: - msg = f"type {type(data)} not implemented" - raise NotImplementedError(msg) - if attributes is not None: + is_object = "class" in attributes r_attributes = self.convert_to_r_attributes(attributes) else: r_attributes = None From 4557d992df6715643b215c1d2529d3972e8327c9 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Wed, 2 Oct 2024 10:41:33 +0300 Subject: [PATCH 072/100] Separate string builders to functions --- rdata/conversion/to_r.py | 94 +++++++++++++++++++++++++++------------- 1 file changed, 64 insertions(+), 30 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index f3ed1f5..6f568d0 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -303,6 +303,63 @@ def build_r_list( return build_r_object(RObjectType.LIST, value=(car, cdr), tag=tag) +def build_r_str( + data: str, + *, + encoding: Encoding, +) -> RObject: + """ + Build R object representing string. + + Args: + data: String. + encoding: Encoding used for strings. + + Returns: + R object. + """ + value = [build_r_char(data, encoding=encoding)] + return build_r_object(RObjectType.STR, value=value) + + +def build_r_char( + data: str | bytes | None, + *, + encoding: Encoding, +) -> RObject: + """ + Build R object representing characters. + + Args: + data: String or bytestring. + encoding: Encoding used for strings. + + Returns: + R object. + """ + if data is None: + return build_r_object(RObjectType.CHAR) + + if isinstance(data, str): + data = data.encode(encoding) + + if all(chr(byte) in string.printable for byte in data): + gp = CharFlags.ASCII + elif encoding == "utf-8": + gp = CharFlags.UTF8 + elif encoding == "cp1252": + # Note! + # CP1252 and Latin1 are not the same. + # Does CharFlags.LATIN1 mean actually CP1252 + # as R on Windows mentions CP1252 as encoding? + # Or does CP1252 change to e.g. CP1250 depending on localization? + gp = CharFlags.LATIN1 + else: + msg = f"unsupported encoding: {encoding}" + raise ValueError(msg) + return build_r_object(RObjectType.CHAR, value=data, gp=gp) + + class ConverterFromPythonToR: """ Class converting Python objects to R objects. @@ -448,7 +505,6 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 is_object = False attributes: dict[str, Any] | None = None tag = None - gp = 0 if data is None: r_type = RObjectType.NILVALUE @@ -484,24 +540,19 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 r_value = [] for el in data: if el is None or pd.isna(el): - r_el = build_r_object(RObjectType.CHAR) + r_el = build_r_char(None, encoding=self.encoding) elif isinstance(el, str): - r_el = self.convert_to_r_object(el.encode(self.encoding)) + r_el = build_r_char(el, encoding=self.encoding) else: msg = "general object array not implemented" raise NotImplementedError(msg) r_value.append(r_el) - elif data.dtype.kind in ["S"]: + elif data.dtype.kind in ["S", "U"]: assert data.ndim == 1 r_type = RObjectType.STR - r_value = [self.convert_to_r_object(el) for el in data] - - elif data.dtype.kind in ["U"]: - assert data.ndim == 1 - data = np.array([s.encode(self.encoding) for s in data], - dtype=np.dtype("S")) - return self.convert_to_r_object(data) + r_value = [build_r_char(el, encoding=self.encoding) + for el in data] else: r_type = { @@ -524,26 +575,10 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 return self.convert_to_r_object(np.array(data)) elif isinstance(data, str): - r_type = RObjectType.STR - r_value = [self.convert_to_r_object(data.encode(self.encoding))] + return build_r_str(data, encoding=self.encoding) elif isinstance(data, bytes): - r_type = RObjectType.CHAR - if all(chr(byte) in string.printable for byte in data): - gp = CharFlags.ASCII - elif self.encoding == "utf-8": - gp = CharFlags.UTF8 - elif self.encoding == "cp1252": - # Note! - # CP1252 and Latin1 are not the same. - # Does CharFlags.LATIN1 mean actually CP1252 - # as R on Windows mentions CP1252 as encoding? - # Or does CP1252 change to e.g. CP1250 depending on localization? - gp = CharFlags.LATIN1 - else: - msg = f"unsupported encoding: {self.encoding}" - raise ValueError(msg) - r_value = data + return build_r_char(data, encoding=self.encoding) elif isinstance(data, range): if self.format_version < R_MINIMUM_VERSION_WITH_ALTREP: @@ -600,7 +635,6 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 is_object=is_object, attributes=r_attributes, tag=tag, - gp=gp, ) From 3e330a3d4942268d31e0f0d5cb5b9097e1d5c76d Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Wed, 2 Oct 2024 10:42:06 +0300 Subject: [PATCH 073/100] Remove unused variable --- rdata/conversion/to_r.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 6f568d0..f1729e8 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -504,7 +504,6 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 r_value: Any = None is_object = False attributes: dict[str, Any] | None = None - tag = None if data is None: r_type = RObjectType.NILVALUE @@ -634,7 +633,6 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 value=r_value, is_object=is_object, attributes=r_attributes, - tag=tag, ) From 0d4596593f6a66e545cbb89a37c59780a433a95d Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Wed, 2 Oct 2024 11:55:42 +0300 Subject: [PATCH 074/100] Convert all built-in types via numpy type --- rdata/conversion/to_r.py | 41 +++++++++++----------------------------- 1 file changed, 11 insertions(+), 30 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index f1729e8..b8a9c9d 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -303,25 +303,6 @@ def build_r_list( return build_r_object(RObjectType.LIST, value=(car, cdr), tag=tag) -def build_r_str( - data: str, - *, - encoding: Encoding, -) -> RObject: - """ - Build R object representing string. - - Args: - data: String. - encoding: Encoding used for strings. - - Returns: - R object. - """ - value = [build_r_char(data, encoding=encoding)] - return build_r_object(RObjectType.STR, value=value) - - def build_r_char( data: str | bytes | None, *, @@ -533,6 +514,10 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 attributes = {"names": names} elif isinstance(data, np.ndarray): + # Promote 0-dimensional array to 1-dimensional array + if data.ndim == 0: + data = data[np.newaxis] + if data.dtype.kind in ["O"]: assert data.ndim == 1 r_type = RObjectType.STR @@ -547,7 +532,11 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 raise NotImplementedError(msg) r_value.append(r_el) - elif data.dtype.kind in ["S", "U"]: + elif data.dtype.kind in ["S"]: # bytes object is converted to this dtype + assert data.size == 1 + return build_r_char(data[0], encoding=self.encoding) + + elif data.dtype.kind in ["U"]: assert data.ndim == 1 r_type = RObjectType.STR r_value = [build_r_char(el, encoding=self.encoding) @@ -561,24 +550,16 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 "c": RObjectType.CPLX, }[data.dtype.kind] - if data.ndim == 0: - r_value = data[np.newaxis] - elif data.ndim == 1: + if data.ndim == 1: r_value = data else: # R uses column-major order like Fortran r_value = np.ravel(data, order="F") attributes = {"dim": np.array(data.shape)} - elif isinstance(data, (bool, int, float, complex)): + elif isinstance(data, (bool, int, float, complex, str, bytes)): return self.convert_to_r_object(np.array(data)) - elif isinstance(data, str): - return build_r_str(data, encoding=self.encoding) - - elif isinstance(data, bytes): - return build_r_char(data, encoding=self.encoding) - elif isinstance(data, range): if self.format_version < R_MINIMUM_VERSION_WITH_ALTREP: # ALTREP support is from R version 3.5.0 From de5a408d66bacc4d9795652bc6f6b694f523fbe8 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Wed, 2 Oct 2024 12:14:54 +0300 Subject: [PATCH 075/100] Raise error for non-string dictionary keys --- rdata/conversion/to_r.py | 3 +++ rdata/tests/test_write.py | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index b8a9c9d..1df39c6 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -510,6 +510,9 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 r_value = [self.convert_to_r_object(el) for el in values] if isinstance(data, dict): + if not all(isinstance(key, str) for key in data): + msg = "dictionary keys must be strings" + raise ValueError(msg) names = np.array(list(data.keys()), dtype=np.dtype("U")) attributes = {"names": names} diff --git a/rdata/tests/test_write.py b/rdata/tests/test_write.py index 8620d3a..a446779 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -206,6 +206,13 @@ def test_convert_to_r_unsupported_encoding() -> None: converter.convert_to_r_object("ä") +def test_convert_to_r_nonstr_dict_keys() -> None: + """Test checking non-string dict keys.""" + converter = ConverterFromPythonToR() + with pytest.raises(ValueError, match="(?i)keys must be strings"): + converter.convert_to_r_object({"a": 1, 2: 2}) + + @pytest.mark.parametrize("file_format", valid_formats) @pytest.mark.parametrize("value", [-2**31 - 1, 2**31]) def test_unparse_big_int(file_format: FileFormat, value: int) -> None: From 5d8187cb0deccbe73984241c936bfeb0e6512986 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Wed, 2 Oct 2024 12:23:10 +0300 Subject: [PATCH 076/100] Raise error for non-string rda variable names --- rdata/conversion/to_r.py | 5 +++++ rdata/tests/test_write.py | 12 +++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 1df39c6..aef8054 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -402,6 +402,9 @@ def convert_to_r_data(self, if not isinstance(data, dict): msg = f"for RDA file, data must be a dictionary, not type {type(data)}" raise TypeError(msg) + if not all(isinstance(key, str) for key in data): + msg = "for RDA file, dictionary keys must be strings" + raise ValueError(msg) r_object = self.convert_to_r_attributes(data) else: r_object = self.convert_to_r_object(data) @@ -453,6 +456,8 @@ def convert_to_r_sym(self, Returns: R object. """ + assert isinstance(name, str) + # Reference to existing symbol if exists if name in self._references: reference = self._references[name] diff --git a/rdata/tests/test_write.py b/rdata/tests/test_write.py index a446779..dd732a1 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -167,12 +167,18 @@ def test_convert_to_r(fname: str, expand_altrep: bool) -> None: # noqa: FBT001 assert data == out_data -def test_convert_to_r_bad_rda() -> None: +def test_convert_to_r_rda_missing_names() -> None: """Test checking that data for RDA has variable names.""" - py_data = "hello" converter = ConverterFromPythonToR() with pytest.raises(TypeError, match="(?i)data must be a dictionary"): - converter.convert_to_r_data(py_data, file_type="rda") + converter.convert_to_r_data("hello", file_type="rda") + + +def test_convert_to_r_rda_nonstr_names() -> None: + """Test checking that RDA variable names are strings.""" + converter = ConverterFromPythonToR() + with pytest.raises(ValueError, match="(?i)keys must be strings"): + converter.convert_to_r_data({1: "hello"}, file_type="rda") def test_convert_to_r_empty_rda() -> None: From 59f4269f5b1dd0f3a97985d6049d2a921f295a50 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Wed, 2 Oct 2024 12:39:15 +0300 Subject: [PATCH 077/100] Use shorthand function --- rdata/tests/test_write.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rdata/tests/test_write.py b/rdata/tests/test_write.py index dd732a1..000704c 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -12,7 +12,7 @@ import pytest import rdata -from rdata.conversion import ConverterFromPythonToR +from rdata.conversion import ConverterFromPythonToR, convert_python_to_r_object from rdata.unparser import unparse_data if TYPE_CHECKING: @@ -253,8 +253,8 @@ def test_convert_dataframe_pandas_dtypes() -> None: index=pd.RangeIndex(3), ) - r_obj1 = ConverterFromPythonToR().convert_to_r_object(df1) - r_obj2 = ConverterFromPythonToR().convert_to_r_object(df2) + r_obj1 = convert_python_to_r_object(df1) + r_obj2 = convert_python_to_r_object(df2) assert str(r_obj1) == str(r_obj2) assert r_obj1 == r_obj2 From 540a59ad8f7feac709d2925f9cedac354af4ed6d Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Wed, 2 Oct 2024 12:40:27 +0300 Subject: [PATCH 078/100] Add constructor_dict to helper functions --- rdata/_write.py | 15 ++++++++++++--- rdata/conversion/__init__.py | 2 ++ rdata/conversion/to_r.py | 17 +++++++++++------ 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/rdata/_write.py b/rdata/_write.py index b1a42e1..3a03128 100644 --- a/rdata/_write.py +++ b/rdata/_write.py @@ -3,15 +3,18 @@ from typing import TYPE_CHECKING -from .conversion import convert_python_to_r_data -from .conversion.to_r import DEFAULT_FORMAT_VERSION +from .conversion import ( + DEFAULT_CONSTRUCTOR_DICT, + DEFAULT_FORMAT_VERSION, + convert_python_to_r_data, +) from .unparser import unparse_file if TYPE_CHECKING: import os from typing import Any - from .conversion.to_r import Encoding + from .conversion.to_r import ConstructorDict, Encoding from .unparser import Compression, FileFormat @@ -23,6 +26,7 @@ def write_rds( compression: Compression = "gzip", encoding: Encoding = "utf-8", format_version: int = DEFAULT_FORMAT_VERSION, + constructor_dict: ConstructorDict = DEFAULT_CONSTRUCTOR_DICT, ) -> None: """ Write an RDS file. @@ -37,6 +41,7 @@ def write_rds( compression: Compression. encoding: Encoding to be used for strings within data. format_version: File format version. + constructor_dict: Dictionary mapping Python types to R classes. See Also: :func:`write_rda`: Similar function that writes an RDA or RDATA file. @@ -53,6 +58,7 @@ def write_rds( data, encoding=encoding, format_version=format_version, + constructor_dict=constructor_dict, ) unparse_file( @@ -72,6 +78,7 @@ def write_rda( compression: Compression = "gzip", encoding: Encoding = "utf-8", format_version: int = DEFAULT_FORMAT_VERSION, + constructor_dict: ConstructorDict = DEFAULT_CONSTRUCTOR_DICT, ) -> None: """ Write an RDA or RDATA file. @@ -86,6 +93,7 @@ def write_rda( compression: Compression. encoding: Encoding to be used for strings within data. format_version: File format version. + constructor_dict: Dictionary mapping Python types to R classes. See Also: :func:`write_rds`: Similar function that writes an RDS file. @@ -102,6 +110,7 @@ def write_rda( data, encoding=encoding, format_version=format_version, + constructor_dict=constructor_dict, file_type="rda", ) diff --git a/rdata/conversion/__init__.py b/rdata/conversion/__init__.py index 55506b3..3d3a699 100644 --- a/rdata/conversion/__init__.py +++ b/rdata/conversion/__init__.py @@ -25,6 +25,8 @@ ts_constructor as ts_constructor, ) from .to_r import ( + DEFAULT_CONSTRUCTOR_DICT as DEFAULT_CONSTRUCTOR_DICT, + DEFAULT_FORMAT_VERSION as DEFAULT_FORMAT_VERSION, ConverterFromPythonToR as ConverterFromPythonToR, convert_python_to_r_data as convert_python_to_r_data, convert_python_to_r_object as convert_python_to_r_object, diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index aef8054..d0eca0d 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -346,31 +346,30 @@ class ConverterFromPythonToR: Class converting Python objects to R objects. Attributes: - constructor_dict: Dictionary mapping Python types to R classes. encoding: Encoding to be used for strings within data. format_version: File format version. r_version_serialized: R version written as the creator of the object. + constructor_dict: Dictionary mapping Python types to R classes. """ - def __init__(self, - constructor_dict: ConstructorDict = DEFAULT_CONSTRUCTOR_DICT, - *, + def __init__(self, *, encoding: Encoding = "utf-8", format_version: int = DEFAULT_FORMAT_VERSION, r_version_serialized: int = DEFAULT_R_VERSION_SERIALIZED, + constructor_dict: ConstructorDict = DEFAULT_CONSTRUCTOR_DICT, ) -> None: """ Init class. Args: - constructor_dict: Dictionary mapping Python types to R classes. encoding: Encoding to be used for strings within data. format_version: File format version. r_version_serialized: R version written as the creator of the object. + constructor_dict: Dictionary mapping Python types to R classes. """ - self.constructor_dict = constructor_dict self.encoding = encoding self.format_version = format_version self.r_version_serialized = r_version_serialized + self.constructor_dict = constructor_dict self._references: dict[str | None, tuple[int, RObject | None]] \ = {None: (0, None)} @@ -631,6 +630,7 @@ def convert_python_to_r_data( encoding: Encoding = "utf-8", format_version: int = DEFAULT_FORMAT_VERSION, r_version_serialized: int = DEFAULT_R_VERSION_SERIALIZED, + constructor_dict: ConstructorDict = DEFAULT_CONSTRUCTOR_DICT, file_type: FileType = "rds", ) -> RData: """ @@ -641,6 +641,7 @@ def convert_python_to_r_data( encoding: Encoding to be used for strings within data. format_version: File format version. r_version_serialized: R version written as the creator of the object. + constructor_dict: Dictionary mapping Python types to R classes. file_type: File type. Returns: @@ -653,6 +654,7 @@ def convert_python_to_r_data( encoding=encoding, format_version=format_version, r_version_serialized=r_version_serialized, + constructor_dict=constructor_dict, ).convert_to_r_data(data, file_type=file_type) @@ -662,6 +664,7 @@ def convert_python_to_r_object( encoding: Encoding = "utf-8", format_version: int = DEFAULT_FORMAT_VERSION, r_version_serialized: int = DEFAULT_R_VERSION_SERIALIZED, + constructor_dict: ConstructorDict = DEFAULT_CONSTRUCTOR_DICT, ) -> RObject: """ Convert Python data to R object. @@ -671,6 +674,7 @@ def convert_python_to_r_object( encoding: Encoding to be used for strings within data. format_version: File format version. r_version_serialized: R version written as the creator of the object. + constructor_dict: Dictionary mapping Python types to R classes. Returns: Corresponding RObject object. @@ -682,4 +686,5 @@ def convert_python_to_r_object( encoding=encoding, format_version=format_version, r_version_serialized=r_version_serialized, + constructor_dict=constructor_dict, ).convert_to_r_object(data) From df8b3910fc73b43555e1cdb4a2302e5ec0986216 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Fri, 25 Oct 2024 14:30:47 +0300 Subject: [PATCH 079/100] Recreate test files in common attribute order --- rdata/conversion/to_r.py | 10 ---------- rdata/tests/data/test_dataframe.rda | Bin 175 -> 176 bytes rdata/tests/data/test_dataframe.rds | Bin 152 -> 153 bytes rdata/tests/data/test_dataframe_v3.rda | Bin 186 -> 187 bytes rdata/tests/data/test_dataframe_v3.rds | Bin 164 -> 164 bytes rdata/tests/test_rdata.py | 6 ++++++ rdata/tests/test_write.py | 7 ------- 7 files changed, 6 insertions(+), 17 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index d0eca0d..435ff69 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -373,11 +373,6 @@ def __init__(self, *, self._references: dict[str | None, tuple[int, RObject | None]] \ = {None: (0, None)} - # In test files the order in which dataframe attributes are written varies. - # R can read files with attributes in any order, but this variable - # is used in tests to change the attribute order to match with the test file. - self.df_attr_order: list[str] | None = None - def convert_to_r_data(self, data: Any, # noqa: ANN401 @@ -605,11 +600,6 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 msg = f"type {type(data)} not implemented" raise NotImplementedError(msg) - # Fix for test files where dataframe attribute order varies - assert isinstance(attributes, dict) - if isinstance(data, pd.DataFrame) and self.df_attr_order is not None: - attributes = {k: attributes[k] for k in self.df_attr_order} - if attributes is not None: is_object = "class" in attributes r_attributes = self.convert_to_r_attributes(attributes) diff --git a/rdata/tests/data/test_dataframe.rda b/rdata/tests/data/test_dataframe.rda index bd83517e593aa88d3253e5941d95aa3dd87dbc85..61cbf300f618bd1ad0c0f45fe8558a065ac3bfe7 100644 GIT binary patch literal 176 zcmV;h08jrPiwFP!0000016@xs4#F@DbmD;2NFXHs(SbMc0;X=rlynMIN<&%sI1O%^ zAT@*Svvt0+FX!f@8w~&wlt7TkP)SY#6jWUI-VFYFCr_QDhXKH2WCQNFVr0gCj@9PY zn$g&S(P7b#Q$ZQr3jU_GG-Uc;q@&jNj?u(XT4E{jSQN$39xzf#~l_1 zCUR)|3h%u(y_~y~>?8mPKn{TnYr(048XB&9r>wi)$E)7@4sT@u%Es zH7jiy9pdl$T2ThKg1_lC*-ZbL4Ai^PFj_dukyu(hdBt*K@=eW5dQyd~G2^Sv(lA53 dIeG6tD-nIMl??Z2(?Q~9%NOEo@hD0G0019OQYrud diff --git a/rdata/tests/data/test_dataframe.rds b/rdata/tests/data/test_dataframe.rds index b5f238242691b4ef5071cd1149776e91b592ae93..bdfbdba8fd5174cfb34d5be2c40b19212b28df10 100644 GIT binary patch literal 153 zcmV;K0A~LmiwFP!0000016_{64#FT1M3-GOX_F=<{zt#Z>ssjvZ9;3k`r{QBC_U_9 zV1_Jj_SOObf;9w4V(FSQP?v(pl$>NSXA6zoXYW?$XK|N8$ghU0xvTlp;CZg&2V~@7 znS!{VNLP|Zy!h#KD@TlROPl=x#0|t9 HBLM&aYk)~Q literal 152 zcmV;J0B8RniwFP!0000016@u*4#F@DOOt}MNt2NH$KJy=jP6vJ)QxfFTFoIjcmzBXT(*GtkR<>yg4HW4}o?1L<#dKou4MmUU pr6(D8$uQ;5EW({Lw|P2%iMorAW{t3U2) z+O2{;Bz-S^FA2Gvnmung05D($MNSZk<9SyfYJe;ymOut2y=x=1x!%!Wr?3}f*;17h zY~}q<>A8zPXCcKXq$afSzjHOg0B(^!Zzbv=|265bHG?8FQN$xz+~Pqi#uJym3mH*9 oMPw`0@T!8<%}C#rthJw6$ZWI~clW54gZRy+FC~#dXH)?I0G2XRrvLx| diff --git a/rdata/tests/data/test_dataframe_v3.rds b/rdata/tests/data/test_dataframe_v3.rds index 6c2ada71662ce503d8b8490e3154ae34c70ae192..8e2492d66cc9b8feba05da9e8f0ec44a3061bc9b 100644 GIT binary patch delta 73 zcmV-P0Ji_60i*$tG6Kdkku)?sO?vF6!4vvY@kqrjUih>;ar52Cf^N2XO$HWLKXAgP f_hiD1D?hxOVji!MA9_LvYrO0SYD{WRECB!j*sUMu delta 73 zcmV-P0Ji_60i*$tG6H5iku)?sm-N_8gD3Q*;*pB4c;VCX#BJ|J7Id>^w#pChrdVT! f{Lm9ZShG+Dc2)hr37g)NC0_Oe6q6JtECB!jCoCbk diff --git a/rdata/tests/test_rdata.py b/rdata/tests/test_rdata.py index 2ab8a4d..6972bcd 100644 --- a/rdata/tests/test_rdata.py +++ b/rdata/tests/test_rdata.py @@ -454,6 +454,9 @@ def test_encodings_v3(self) -> None: def test_dataframe(self) -> None: """Test dataframe conversion.""" + # Files created in R with + # test_dataframe = data.frame(class=factor(c("a", "b", "b")), value=c(1L, 2L, 3L)); save(test_dataframe, file="test_dataframe.rda", version=2) # noqa: E501 + # test_dataframe = data.frame(class=factor(c("a", "b", "b")), value=c(1L, 2L, 3L)); save(test_dataframe, file="test_dataframe_v3.rda") # noqa: E501 for f in ("test_dataframe.rda", "test_dataframe_v3.rda"): with self.subTest(file=f): data = rdata.read_rda(TESTDATA_PATH / f) @@ -476,6 +479,9 @@ def test_dataframe(self) -> None: def test_dataframe_rds(self) -> None: """Test dataframe conversion.""" + # Files created in R with + # df = data.frame(class=factor(c("a", "b", "b")), value=c(1L, 2L, 3L)); saveRDS(df, file="test_dataframe.rds", version=2) # noqa: E501 + # df = data.frame(class=factor(c("a", "b", "b")), value=c(1L, 2L, 3L)); saveRDS(df, file="test_dataframe_v3.rds") # noqa: E501 for f in ("test_dataframe.rds", "test_dataframe_v3.rds"): with self.subTest(file=f): data = rdata.read_rds(TESTDATA_PATH / f) diff --git a/rdata/tests/test_write.py b/rdata/tests/test_write.py index 000704c..96dc96c 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -138,13 +138,6 @@ def test_convert_to_r(fname: str, expand_altrep: bool) -> None: # noqa: FBT001 format_version=r_data.versions.format, r_version_serialized=r_data.versions.serialized, ) - if fname in [ - "test_dataframe.rda", - "test_dataframe.rds", - "test_dataframe_v3.rda", - "test_dataframe_v3.rds", - ]: - converter.df_attr_order = ["names", "row.names", "class"] try: new_r_data = converter.convert_to_r_data(py_data, file_type=file_type) From 1a00c1db63de5951ae929efc34a628fc7575309f Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Fri, 25 Oct 2024 14:46:50 +0300 Subject: [PATCH 080/100] Skip altreps with attributes in test --- rdata/tests/test_write.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/rdata/tests/test_write.py b/rdata/tests/test_write.py index 96dc96c..575f23e 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -108,10 +108,14 @@ def test_convert_to_r(fname: str, expand_altrep: bool) -> None: # noqa: FBT001 with (TESTDATA_PATH / fname).open("rb") as f: # Skip test files without unique R->py->R transformation if fname in [ - "test_encodings.rda", # encoding not kept in Python - "test_encodings_v3.rda", # encoding not kept in Python - "test_list_attrs.rda", # attributes not kept in Python - "test_file.rda", # attributes not kept in Python + # encoding not kept in Python + "test_encodings.rda", + "test_encodings_v3.rda", + # attributes not kept in Python + "test_list_attrs.rda", + "test_file.rda", + "test_altrep_wrap_real_attributes.rds", + "test_altrep_wrap_real_class_attribute.rds", ]: pytest.skip("ambiguous R->py->R transformation") From ff6b6a9bf2aef6727fabe10c0738aa6e7216f75d Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Fri, 25 Oct 2024 14:47:21 +0300 Subject: [PATCH 081/100] Fix ruff --- rdata/unparser/_ascii.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdata/unparser/_ascii.py b/rdata/unparser/_ascii.py index 6bb694b..b9ce4f4 100644 --- a/rdata/unparser/_ascii.py +++ b/rdata/unparser/_ascii.py @@ -13,7 +13,7 @@ if TYPE_CHECKING: import io - from typing import Any, Final + from typing import Final import numpy.typing as npt From daf1e3a4cdbdc904612d70fd143bded24ac47c47 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Fri, 25 Oct 2024 14:49:12 +0300 Subject: [PATCH 082/100] Filter expected warnings --- rdata/tests/test_rdata.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rdata/tests/test_rdata.py b/rdata/tests/test_rdata.py index 6972bcd..d06df71 100644 --- a/rdata/tests/test_rdata.py +++ b/rdata/tests/test_rdata.py @@ -808,6 +808,7 @@ def test_altrep_wrap_real_attributes(self) -> None: data = rdata.conversion.convert(parsed) np.testing.assert_equal(data, [1., 2., 3.]) + @pytest.mark.filterwarnings("ignore:Missing constructor") def test_altrep_wrap_real_class_attribute(self) -> None: """Test alternative representation of wrap_real with class attribute.""" # File created in R with From 943e69747fbede54cd3e7398c4f7689fe7e95bea Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Fri, 25 Oct 2024 15:13:25 +0300 Subject: [PATCH 083/100] Pass converter object to constructor functions --- rdata/conversion/to_r.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 435ff69..c2cb188 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -35,19 +35,21 @@ Encoding = Literal["utf-8", "cp1252"] - class ConversionFunction(Protocol): - """Protocol for Py-to-R conversion function.""" - - def __call__(self, data: Any) -> RObject: # noqa: ANN401 - """Convert Python object to R object.""" + class Converter(Protocol): + """Protocol for class converting Python objects to R objects.""" + def convert_to_r_object( + self, + data: Any, # noqa: ANN401 + ) -> RObject: + """Convert Python data to R object.""" class ConstructorFunction(Protocol): """Protocol for Py-to-R constructor function.""" def __call__(self, data: Any, # noqa: ANN401 - convert_to_r_object: ConversionFunction, + converter: Converter, ) -> tuple[RObjectType, Any, dict[str, Any]]: """Convert Python object to R object components.""" @@ -69,14 +71,14 @@ def __call__(self, def categorical_constructor( data: pd.Categorical, - convert_to_r_object: ConversionFunction, # noqa: ARG001 + converter: Converter, # noqa: ARG001 ) -> tuple[RObjectType, Any, dict[str, Any]]: """ Construct R object components from pandas categorical. Args: data: Pandas categorical. - convert_to_r_object: Conversion function. + converter: Python-to-R converter. Returns: Components of the R object. @@ -93,14 +95,14 @@ def categorical_constructor( def dataframe_constructor( data: pd.DataFrame, - convert_to_r_object: ConversionFunction, + converter: Converter, ) -> tuple[RObjectType, Any, dict[str, Any]]: """ Construct R object components from pandas dataframe. Args: data: Pandas dataframe. - convert_to_r_object: Conversion function. + converter: Python-to-R converter. Returns: Components of the R object. @@ -119,7 +121,7 @@ def dataframe_constructor( array = pd_array else: array = convert_pd_array_to_np_array(pd_array) - r_series = convert_to_r_object(array) + r_series = converter.convert_to_r_object(array) r_value.append(r_series) index = data.index @@ -592,8 +594,7 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 # Check available constructors for t, constructor in self.constructor_dict.items(): if isinstance(data, t): - r_type, r_value, attributes \ - = constructor(data, self.convert_to_r_object) + r_type, r_value, attributes = constructor(data, self) break if r_type is None: From 8a269ae91c0da35f92846ee62211fc74931f5c5f Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Fri, 25 Oct 2024 15:34:26 +0300 Subject: [PATCH 084/100] Allow constructor functions without converter --- rdata/conversion/to_r.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index c2cb188..06f141c 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -2,6 +2,7 @@ from __future__ import annotations +import inspect import string from types import MappingProxyType from typing import TYPE_CHECKING @@ -26,7 +27,7 @@ ) if TYPE_CHECKING: - from collections.abc import Mapping + from collections.abc import Callable, Mapping from typing import Any, Final, Literal, Protocol import numpy.typing as npt @@ -44,16 +45,11 @@ def convert_to_r_object( ) -> RObject: """Convert Python data to R object.""" - class ConstructorFunction(Protocol): - """Protocol for Py-to-R constructor function.""" + ConstructorReturnValue = tuple[RObjectType, Any, dict[str, Any]] + ConstructorFunction1 = Callable[[Any], ConstructorReturnValue] + ConstructorFunction2 = Callable[[Any, Converter], ConstructorReturnValue] - def __call__(self, - data: Any, # noqa: ANN401 - converter: Converter, - ) -> tuple[RObjectType, Any, dict[str, Any]]: - """Convert Python object to R object components.""" - - ConstructorDict = Mapping[type, ConstructorFunction] + ConstructorDict = Mapping[type, ConstructorFunction1 | ConstructorFunction2] # Default values for RVersions object @@ -71,14 +67,12 @@ def __call__(self, def categorical_constructor( data: pd.Categorical, - converter: Converter, # noqa: ARG001 -) -> tuple[RObjectType, Any, dict[str, Any]]: +) -> ConstructorReturnValue: """ Construct R object components from pandas categorical. Args: data: Pandas categorical. - converter: Python-to-R converter. Returns: Components of the R object. @@ -96,7 +90,7 @@ def categorical_constructor( def dataframe_constructor( data: pd.DataFrame, converter: Converter, -) -> tuple[RObjectType, Any, dict[str, Any]]: +) -> ConstructorReturnValue: """ Construct R object components from pandas dataframe. @@ -594,7 +588,16 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 # Check available constructors for t, constructor in self.constructor_dict.items(): if isinstance(data, t): - r_type, r_value, attributes = constructor(data, self) + n_params = len(inspect.signature(constructor).parameters) + args: tuple[Any] | tuple[Any, Converter] + if n_params == 1: + args = (data,) + elif n_params == 2: # noqa: PLR2004 + args = (data, self) + else: + msg = "constructor function has wrong call signature" + raise ValueError(msg) + r_type, r_value, attributes = constructor(*args) break if r_type is None: From 9718161022b8f6946d635138040051fee962c133 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 28 Oct 2024 08:48:36 +0300 Subject: [PATCH 085/100] Convert only pandas rangeindex to altrep --- rdata/conversion/to_r.py | 84 +++++++++++++++++++++++++--------------- 1 file changed, 52 insertions(+), 32 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 06f141c..335eb6f 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -38,14 +38,15 @@ class Converter(Protocol): """Protocol for class converting Python objects to R objects.""" + format_version: int - def convert_to_r_object( - self, - data: Any, # noqa: ANN401 - ) -> RObject: + def convert_to_r_sym(self, name: str) -> RObject: + """Convert string to R symbol.""" + + def convert_to_r_object(self, data: Any) -> RObject: # noqa: ANN401 """Convert Python data to R object.""" - ConstructorReturnValue = tuple[RObjectType, Any, dict[str, Any]] + ConstructorReturnValue = tuple[RObjectType, Any, dict[str, Any] | None] ConstructorFunction1 = Callable[[Any], ConstructorReturnValue] ConstructorFunction2 = Callable[[Any, Converter], ConstructorReturnValue] @@ -131,7 +132,7 @@ def dataframe_constructor( fill_value=R_INT_NA, ) else: - row_names = range(index.start, index.stop, index.step) + row_names = index elif isinstance(index, pd.Index): if (index.dtype == "object" or np.issubdtype(str(index.dtype), np.integer)): @@ -151,9 +152,54 @@ def dataframe_constructor( return r_type, r_value, attributes +def rangeindex_constructor( + data: pd.RangeIndex, + converter: Converter, +) -> ConstructorReturnValue: + """ + Construct R object components from pandas rangeindex. + + Args: + data: Pandas rangeindex. + converter: Python-to-R converter. + + Returns: + Components of the R object. + """ + assert isinstance(data, pd.RangeIndex) + if converter.format_version < R_MINIMUM_VERSION_WITH_ALTREP: + # ALTREP support is from R version 3.5.0 + # (minimum version for format version 3) + return RObjectType.INT, np.array(data), None + + assert isinstance(data.step, int) + if data.step != 1: + # R supports compact sequences only with step 1; + # convert the range to an array of values + return RObjectType.INT, np.array(data), None + + r_type = RObjectType.ALTREP + r_value = ( + build_r_list([ + converter.convert_to_r_sym("compact_intseq"), + converter.convert_to_r_sym("base"), + converter.convert_to_r_object(RObjectType.INT.value), + ]), + converter.convert_to_r_object(np.array([ + len(data), + data.start, + data.step, + ], dtype=float)), + converter.convert_to_r_object(None), + ) + attributes = None + return r_type, r_value, attributes + + DEFAULT_CONSTRUCTOR_DICT: Final[ConstructorDict] = MappingProxyType({ pd.Categorical: categorical_constructor, pd.DataFrame: dataframe_constructor, + pd.RangeIndex: rangeindex_constructor, }) @@ -558,32 +604,6 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 elif isinstance(data, (bool, int, float, complex, str, bytes)): return self.convert_to_r_object(np.array(data)) - elif isinstance(data, range): - if self.format_version < R_MINIMUM_VERSION_WITH_ALTREP: - # ALTREP support is from R version 3.5.0 - # (minimum version for format version 3) - return self.convert_to_r_object(np.array(data)) - - if data.step != 1: - # R supports compact sequences only with step 1; - # convert the range to an array of values - return self.convert_to_r_object(np.array(data)) - - r_type = RObjectType.ALTREP - r_value = ( - build_r_list([ - self.convert_to_r_sym("compact_intseq"), - self.convert_to_r_sym("base"), - self.convert_to_r_object(RObjectType.INT.value), - ]), - self.convert_to_r_object(np.array([ - len(data), - data.start, - data.step, - ], dtype=float)), - self.convert_to_r_object(None), - ) - else: # Check available constructors for t, constructor in self.constructor_dict.items(): From a7c7066a8e65e58c4f406039ba7641dd469681f9 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 28 Oct 2024 09:08:16 +0300 Subject: [PATCH 086/100] Use more robust indexing --- rdata/tests/test_write.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/rdata/tests/test_write.py b/rdata/tests/test_write.py index 575f23e..6e543a8 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -239,15 +239,16 @@ def test_convert_dataframe_pandas_dtypes() -> None: index=range(3), ) + index = pd.RangeIndex(3) df2 = pd.DataFrame( { - "int": pd.Series([10, 20, 30], dtype=pd.Int32Dtype()), - "float": pd.Series([1.1, 2.2, 3.3], dtype=pd.Float64Dtype()), - "string": pd.Series(["x" ,"y", "z"], dtype=pd.StringDtype()), - "bool": pd.Series([True, False, True], dtype=pd.BooleanDtype()), - "complex": pd.Series([4+5j, 6+7j, 8+9j], dtype=complex), + "int": pd.Series([10, 20, 30], dtype=pd.Int32Dtype(), index=index), + "float": pd.Series([1.1, 2.2, 3.3], dtype=pd.Float64Dtype(), index=index), + "string": pd.Series(["x" ,"y", "z"], dtype=pd.StringDtype(), index=index), + "bool": pd.Series([1, 0, 1], dtype=pd.BooleanDtype(), index=index), + "complex": pd.Series([4+5j, 6+7j, 8+9j], dtype=complex, index=index), }, - index=pd.RangeIndex(3), + index=index, ) r_obj1 = convert_python_to_r_object(df1) From 5a430aaf09461f3a2596f9f615bbeda70c7b97a3 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 28 Oct 2024 09:08:37 +0300 Subject: [PATCH 087/100] Add tests for rangeindex --- rdata/parser/_parser.py | 3 +++ rdata/tests/test_write.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/rdata/parser/_parser.py b/rdata/parser/_parser.py index dbf2cc5..8c90486 100644 --- a/rdata/parser/_parser.py +++ b/rdata/parser/_parser.py @@ -369,6 +369,9 @@ def __eq__(self, other: object) -> bool: return False # Compare value field + if not isinstance(other.value, type(self.value)): + return False + if isinstance(self.value, np.ndarray): if not np.array_equal(self.value, other.value, equal_nan=True): return False diff --git a/rdata/tests/test_write.py b/rdata/tests/test_write.py index 6e543a8..9034055 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -258,6 +258,34 @@ def test_convert_dataframe_pandas_dtypes() -> None: assert r_obj1 == r_obj2 +def test_convert_dataframe_rangeindex() -> None: + """Test converting dataframe with rangeindex.""" + data = {"data": np.array([10, 20, 30], dtype=np.int32)} + + df1 = pd.DataFrame(data, index=pd.RangeIndex(3)) + df2 = pd.DataFrame(data, index=pd.Index([0, 1, 2])) + + r_obj1 = convert_python_to_r_object(df1) + r_obj2 = convert_python_to_r_object(df2) + + assert str(r_obj1) != str(r_obj2) + assert r_obj1 != r_obj2 + + +def test_convert_dataframe_rangeindex_flattened() -> None: + """Test converting dataframe with rangeindex.""" + data = {"data": np.array([10, 20, 30], dtype=np.int32)} + + df1 = pd.DataFrame(data, index=pd.RangeIndex(3, 8, 2)) + df2 = pd.DataFrame(data, index=pd.Index([3, 5, 7])) + + r_obj1 = convert_python_to_r_object(df1) + r_obj2 = convert_python_to_r_object(df2) + + assert str(r_obj1) == str(r_obj2) + assert r_obj1 == r_obj2 + + @pytest.mark.parametrize("compression", [*valid_compressions, "fail"]) @pytest.mark.parametrize("file_format", [*valid_formats, None, "fail"]) @pytest.mark.parametrize("file_type", ["rds", "rda"]) From 87f4c652a5b7f33895164d4810bd5ccd188bf589 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 28 Oct 2024 09:14:37 +0300 Subject: [PATCH 088/100] Remove conversion of altrep to range --- rdata/conversion/_conversion.py | 37 --------------------------------- 1 file changed, 37 deletions(-) diff --git a/rdata/conversion/_conversion.py b/rdata/conversion/_conversion.py index 8530b8a..dcd5305 100644 --- a/rdata/conversion/_conversion.py +++ b/rdata/conversion/_conversion.py @@ -14,8 +14,6 @@ import xarray from typing_extensions import override -from rdata.parser._parser import get_altrep_name - from .. import parser ConversionFunction = Callable[[Union[parser.RData, parser.RObject]], Any] @@ -396,38 +394,6 @@ def convert_array( return value # type: ignore [no-any-return] -def convert_altrep_to_range( - r_altrep: parser.RObject, -) -> range: - """ - Convert a R altrep to range object. - - Args: - r_altrep: R altrep object - - Returns: - Range object. - """ - if r_altrep.info.type != parser.RObjectType.ALTREP: - msg = "Must receive an altrep object" - raise TypeError(msg) - - info, state, attr = r_altrep.value - assert attr.info.type == parser.RObjectType.NILVALUE - - altrep_name = get_altrep_name(info) - - if altrep_name != b"compact_intseq": - msg = "Only compact integer sequences can be converted to range" - raise NotImplementedError(msg) - - n = int(state.value[0]) - start = int(state.value[1]) - step = int(state.value[2]) - stop = start + (n - 1) * step - return range(start, stop + 1, step) - - def _dataframe_column_transform(source: Any) -> Any: # noqa: ANN401 if isinstance(source, np.ndarray): @@ -872,9 +838,6 @@ def _convert_next( # noqa: C901, PLR0912, PLR0915 value = None - elif obj.info.type == parser.RObjectType.ALTREP: - value = convert_altrep_to_range(obj) - else: msg = f"Type {obj.info.type} not implemented" raise NotImplementedError(msg) From 25a14af66554caec8fb4d888e8667f0994563bc2 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 28 Oct 2024 11:00:52 +0200 Subject: [PATCH 089/100] Clarify skip message --- rdata/tests/test_write.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rdata/tests/test_write.py b/rdata/tests/test_write.py index 9034055..e733518 100644 --- a/rdata/tests/test_write.py +++ b/rdata/tests/test_write.py @@ -106,7 +106,7 @@ def test_unparse(fname: str) -> None: def test_convert_to_r(fname: str, expand_altrep: bool) -> None: # noqa: FBT001 """Test converting Python data to RData object.""" with (TESTDATA_PATH / fname).open("rb") as f: - # Skip test files without unique R->py->R transformation + # Skip test files without unique transformation if fname in [ # encoding not kept in Python "test_encodings.rda", @@ -117,7 +117,7 @@ def test_convert_to_r(fname: str, expand_altrep: bool) -> None: # noqa: FBT001 "test_altrep_wrap_real_attributes.rds", "test_altrep_wrap_real_class_attribute.rds", ]: - pytest.skip("ambiguous R->py->R transformation") + pytest.skip("ambiguous R-to-Python-to-R transformation") data = decompress_data(f.read()) file_type, file_format = parse_file_type_and_format(data) From 798408927426ee0a7269f5a84d43a381fd62ccd1 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 28 Oct 2024 12:29:18 +0200 Subject: [PATCH 090/100] Fix ruff formatting --- rdata/_write.py | 1 + rdata/conversion/__init__.py | 1 + rdata/conversion/to_r.py | 67 +++++++++++++++++++----------------- rdata/parser/_ascii.py | 7 ++-- rdata/parser/_xdr.py | 6 ++-- rdata/unparser/__init__.py | 40 +++++++++++---------- rdata/unparser/_ascii.py | 5 +-- rdata/unparser/_unparser.py | 3 +- rdata/unparser/_xdr.py | 3 +- 9 files changed, 72 insertions(+), 61 deletions(-) diff --git a/rdata/_write.py b/rdata/_write.py index 3a03128..a534af0 100644 --- a/rdata/_write.py +++ b/rdata/_write.py @@ -1,4 +1,5 @@ """Functions to perform conversion and unparsing in one step.""" + from __future__ import annotations from typing import TYPE_CHECKING diff --git a/rdata/conversion/__init__.py b/rdata/conversion/__init__.py index 3d3a699..ab13caa 100644 --- a/rdata/conversion/__init__.py +++ b/rdata/conversion/__init__.py @@ -1,4 +1,5 @@ """Utilities for converting R objects to Python ones.""" + from ._conversion import ( DEFAULT_CLASS_MAP as DEFAULT_CLASS_MAP, Converter as Converter, diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 335eb6f..7ecf7f5 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -38,6 +38,7 @@ class Converter(Protocol): """Protocol for class converting Python objects to R objects.""" + format_version: int def convert_to_r_sym(self, name: str) -> RObject: @@ -122,10 +123,7 @@ def dataframe_constructor( index = data.index if isinstance(index, pd.RangeIndex): assert isinstance(index.start, int) - if (index.start == 1 - and index.stop == data.shape[0] + 1 - and index.step == 1 - ): + if index.start == 1 and index.stop == data.shape[0] + 1 and index.step == 1: row_names = np.ma.array( data=[R_INT_NA, -data.shape[0]], mask=[True, False], @@ -134,8 +132,7 @@ def dataframe_constructor( else: row_names = index elif isinstance(index, pd.Index): - if (index.dtype == "object" - or np.issubdtype(str(index.dtype), np.integer)): + if index.dtype == "object" or np.issubdtype(str(index.dtype), np.integer): row_names = index.to_numpy() else: msg = f"pd.DataFrame pd.Index {index.dtype} not implemented" @@ -296,10 +293,11 @@ def build_r_object( """ assert r_type is not None reference_id, referenced_object = reference - assert ((reference_id == 0) - == (referenced_object is None) - == (r_type != RObjectType.REF) - ) + assert ( + (reference_id == 0) + == (referenced_object is None) + == (r_type != RObjectType.REF) + ) return RObject( RObjectInfo( r_type, @@ -308,12 +306,12 @@ def build_r_object( tag=tag is not None, gp=gp, reference=reference_id, - ), - value, - attributes, - tag, - referenced_object, - ) + ), + value, + attributes, + tag, + referenced_object, + ) def build_r_list( @@ -393,7 +391,10 @@ class ConverterFromPythonToR: r_version_serialized: R version written as the creator of the object. constructor_dict: Dictionary mapping Python types to R classes. """ - def __init__(self, *, + + def __init__( + self, + *, encoding: Encoding = "utf-8", format_version: int = DEFAULT_FORMAT_VERSION, r_version_serialized: int = DEFAULT_R_VERSION_SERIALIZED, @@ -412,11 +413,12 @@ def __init__(self, *, self.format_version = format_version self.r_version_serialized = r_version_serialized self.constructor_dict = constructor_dict - self._references: dict[str | None, tuple[int, RObject | None]] \ - = {None: (0, None)} + self._references: dict[str | None, tuple[int, RObject | None]] = { + None: (0, None), + } - - def convert_to_r_data(self, + def convert_to_r_data( + self, data: Any, # noqa: ANN401 *, file_type: FileType = "rds", @@ -451,14 +453,16 @@ def convert_to_r_data(self, R_MINIMUM_VERSIONS[self.format_version], ) - extra = (RExtraInfo(self.encoding.upper()) - if versions.format >= R_MINIMUM_VERSION_WITH_ENCODING - else RExtraInfo(None)) + extra = ( + RExtraInfo(self.encoding.upper()) + if versions.format >= R_MINIMUM_VERSION_WITH_ENCODING + else RExtraInfo(None) + ) return RData(versions, extra, r_object) - - def convert_to_r_attributes(self, + def convert_to_r_attributes( + self, data: dict[str, Any], ) -> RObject: """ @@ -479,8 +483,8 @@ def convert_to_r_attributes(self, return build_r_list(converted) - - def convert_to_r_sym(self, + def convert_to_r_sym( + self, name: str, ) -> RObject: """ @@ -507,8 +511,8 @@ def convert_to_r_sym(self, self._references[name] = (len(self._references), r_object) return r_object - - def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 + def convert_to_r_object( # noqa: C901, PLR0912, PLR0915 + self, data: Any, # noqa: ANN401 ) -> RObject: """ @@ -583,8 +587,7 @@ def convert_to_r_object(self, # noqa: C901, PLR0912, PLR0915 elif data.dtype.kind in ["U"]: assert data.ndim == 1 r_type = RObjectType.STR - r_value = [build_r_char(el, encoding=self.encoding) - for el in data] + r_value = [build_r_char(el, encoding=self.encoding) for el in data] else: r_type = { diff --git a/rdata/parser/_ascii.py b/rdata/parser/_ascii.py index 23d22b1..a79d2ba 100644 --- a/rdata/parser/_ascii.py +++ b/rdata/parser/_ascii.py @@ -40,11 +40,10 @@ def _readline(self) -> str: return self.file.readline()[:-1] def _parse_array_values( - self, - dtype: npt.DTypeLike, - length: int, + self, + dtype: npt.DTypeLike, + length: int, ) -> npt.NDArray[Any]: - array = np.empty(length, dtype=dtype) value: int | float | complex diff --git a/rdata/parser/_xdr.py b/rdata/parser/_xdr.py index 6d265dd..fe5211a 100644 --- a/rdata/parser/_xdr.py +++ b/rdata/parser/_xdr.py @@ -26,9 +26,9 @@ def __init__( self.file = io.BytesIO(data) def _parse_array_values( - self, - dtype: npt.DTypeLike, - length: int, + self, + dtype: npt.DTypeLike, + length: int, ) -> npt.NDArray[Any]: dtype = np.dtype(dtype) buffer = self.file.read(length * dtype.itemsize) diff --git a/rdata/unparser/__init__.py b/rdata/unparser/__init__.py index 0fdc243..02a189e 100644 --- a/rdata/unparser/__init__.py +++ b/rdata/unparser/__init__.py @@ -25,12 +25,12 @@ def unparse_file( - path: os.PathLike[Any] | str, - r_data: RData, - *, - file_format: FileFormat = "xdr", - file_type: FileType = "rds", - compression: Compression = "gzip", + path: os.PathLike[Any] | str, + r_data: RData, + *, + file_format: FileFormat = "xdr", + file_type: FileType = "rds", + compression: Compression = "gzip", ) -> None: """ Unparse RData object to a file. @@ -59,11 +59,11 @@ def unparse_file( def unparse_fileobj( - fileobj: IO[Any], - r_data: RData, - *, - file_format: FileFormat = "xdr", - file_type: FileType = "rds", + fileobj: IO[Any], + r_data: RData, + *, + file_format: FileFormat = "xdr", + file_type: FileType = "rds", ) -> None: """ Unparse RData object to a file object. @@ -78,9 +78,11 @@ def unparse_fileobj( if file_format == "ascii": from ._ascii import UnparserASCII as Unparser + rda_magic = "RDA" elif file_format == "xdr": from ._xdr import UnparserXDR as Unparser + rda_magic = "RDX" else: msg = f"Unknown file format: {file_format}" @@ -89,9 +91,11 @@ def unparse_fileobj( # Check that RData object for rda file is of correct kind if file_type == "rda": r_object = r_data.object - if not (r_object.info.type is RObjectType.LIST - and r_object.tag is not None - and r_object.tag.info.type is RObjectType.SYM): + if not ( + r_object.info.type is RObjectType.LIST + and r_object.tag is not None + and r_object.tag.info.type is RObjectType.SYM + ): msg = "r_data object must be dictionary-like for rda file" raise ValueError(msg) @@ -104,10 +108,10 @@ def unparse_fileobj( def unparse_data( - r_data: RData, - *, - file_format: FileFormat = "xdr", - file_type: FileType = "rds", + r_data: RData, + *, + file_format: FileFormat = "xdr", + file_type: FileType = "rds", ) -> bytes: """ Unparse RData object to a bytestring. diff --git a/rdata/unparser/_ascii.py b/rdata/unparser/_ascii.py index b9ce4f4..0881631 100644 --- a/rdata/unparser/_ascii.py +++ b/rdata/unparser/_ascii.py @@ -35,7 +35,7 @@ def escape(b: bytes) -> str: byte_to_str[byte] = escape(bytes([byte])) # Update mapping for special characters - byte_to_str[b'"'[0]] = r'\"' + byte_to_str[b'"'[0]] = r"\"" byte_to_str[b"'"[0]] = r"\'" byte_to_str[b"?"[0]] = r"\?" byte_to_str[b" "[0]] = r"\040" @@ -68,7 +68,8 @@ def unparse_magic(self) -> None: """Unparse magic bits.""" self._add_line("A") - def _unparse_array_values_raw(self, + def _unparse_array_values_raw( + self, array: npt.NDArray[np.int32 | np.float64 | np.complex128], ) -> None: # Convert complex to pairs of floats diff --git a/rdata/unparser/_unparser.py b/rdata/unparser/_unparser.py index 6079e89..b17630e 100644 --- a/rdata/unparser/_unparser.py +++ b/rdata/unparser/_unparser.py @@ -94,7 +94,8 @@ def _unparse_array_values(self, array: npt.NDArray[Any]) -> None: self._unparse_array_values_raw(array) @abc.abstractmethod - def _unparse_array_values_raw(self, + def _unparse_array_values_raw( + self, array: npt.NDArray[np.int32 | np.float64 | np.complex128], ) -> None: """Unparse the values of an array as such.""" diff --git a/rdata/unparser/_xdr.py b/rdata/unparser/_xdr.py index 0c6d3fc..255c182 100644 --- a/rdata/unparser/_xdr.py +++ b/rdata/unparser/_xdr.py @@ -27,7 +27,8 @@ def unparse_magic(self) -> None: """Unparse magic bits.""" self.file.write(b"X\n") - def _unparse_array_values_raw(self, + def _unparse_array_values_raw( + self, array: npt.NDArray[np.int32 | np.float64 | np.complex128], ) -> None: # Convert to big endian if needed From 9570204680b62608ea79ce636e8a007a57baf9fe Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Wed, 22 Jan 2025 14:25:34 +0200 Subject: [PATCH 091/100] Fix docstring --- rdata/conversion/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdata/conversion/__init__.py b/rdata/conversion/__init__.py index ab13caa..c5855df 100644 --- a/rdata/conversion/__init__.py +++ b/rdata/conversion/__init__.py @@ -1,4 +1,4 @@ -"""Utilities for converting R objects to Python ones.""" +"""Utilities for converting between R and Python objects.""" from ._conversion import ( DEFAULT_CLASS_MAP as DEFAULT_CLASS_MAP, From b057ee18cebd814f405b08792f17bb8d1645a6bd Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Wed, 22 Jan 2025 14:35:58 +0200 Subject: [PATCH 092/100] Include converter always in constructor functions This reverts commit 8a269ae91c0da35f92846ee62211fc74931f5c5f. --- rdata/conversion/to_r.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 7ecf7f5..62e5d6e 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -2,7 +2,6 @@ from __future__ import annotations -import inspect import string from types import MappingProxyType from typing import TYPE_CHECKING @@ -48,10 +47,8 @@ def convert_to_r_object(self, data: Any) -> RObject: # noqa: ANN401 """Convert Python data to R object.""" ConstructorReturnValue = tuple[RObjectType, Any, dict[str, Any] | None] - ConstructorFunction1 = Callable[[Any], ConstructorReturnValue] - ConstructorFunction2 = Callable[[Any, Converter], ConstructorReturnValue] - - ConstructorDict = Mapping[type, ConstructorFunction1 | ConstructorFunction2] + ConstructorFunction = Callable[[Any, Converter], ConstructorReturnValue] + ConstructorDict = Mapping[type, ConstructorFunction] # Default values for RVersions object @@ -69,12 +66,14 @@ def convert_to_r_object(self, data: Any) -> RObject: # noqa: ANN401 def categorical_constructor( data: pd.Categorical, + converter: Converter, # noqa: ARG001 ) -> ConstructorReturnValue: """ Construct R object components from pandas categorical. Args: data: Pandas categorical. + converter: Python-to-R converter. Returns: Components of the R object. @@ -611,16 +610,7 @@ def convert_to_r_object( # noqa: C901, PLR0912, PLR0915 # Check available constructors for t, constructor in self.constructor_dict.items(): if isinstance(data, t): - n_params = len(inspect.signature(constructor).parameters) - args: tuple[Any] | tuple[Any, Converter] - if n_params == 1: - args = (data,) - elif n_params == 2: # noqa: PLR2004 - args = (data, self) - else: - msg = "constructor function has wrong call signature" - raise ValueError(msg) - r_type, r_value, attributes = constructor(*args) + r_type, r_value, attributes = constructor(data, self) break if r_type is None: From fb76598967117def95e004bcc865df6058aeef57 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Wed, 22 Jan 2025 15:01:56 +0200 Subject: [PATCH 093/100] Return R object from constructors --- rdata/conversion/to_r.py | 67 +++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 25 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 62e5d6e..ee80ae6 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -40,14 +40,16 @@ class Converter(Protocol): format_version: int + def convert_to_r_attributes(self, data: dict[str, Any]) -> RObject: + """Convert dictionary to R attributes list.""" + def convert_to_r_sym(self, name: str) -> RObject: """Convert string to R symbol.""" def convert_to_r_object(self, data: Any) -> RObject: # noqa: ANN401 """Convert Python data to R object.""" - ConstructorReturnValue = tuple[RObjectType, Any, dict[str, Any] | None] - ConstructorFunction = Callable[[Any, Converter], ConstructorReturnValue] + ConstructorFunction = Callable[[Any, Converter], RObject] ConstructorDict = Mapping[type, ConstructorFunction] @@ -66,8 +68,8 @@ def convert_to_r_object(self, data: Any) -> RObject: # noqa: ANN401 def categorical_constructor( data: pd.Categorical, - converter: Converter, # noqa: ARG001 -) -> ConstructorReturnValue: + converter: Converter, +) -> RObject: """ Construct R object components from pandas categorical. @@ -79,19 +81,23 @@ def categorical_constructor( Components of the R object. """ assert isinstance(data, pd.Categorical) - r_type = RObjectType.INT - r_value = data.codes + 1 - attributes = { + r_attributes = converter.convert_to_r_attributes({ "levels": data.categories.to_numpy(), "class": "factor", - } - return r_type, r_value, attributes + }) + + return build_r_object( + RObjectType.INT, + value=data.codes + 1, + is_object=True, + attributes=r_attributes, + ) def dataframe_constructor( data: pd.DataFrame, converter: Converter, -) -> ConstructorReturnValue: +) -> RObject: """ Construct R object components from pandas dataframe. @@ -103,7 +109,6 @@ def dataframe_constructor( Components of the R object. """ assert isinstance(data, pd.DataFrame) - r_type = RObjectType.VEC column_names = [] r_value = [] for column, series in data.items(): @@ -140,18 +145,24 @@ def dataframe_constructor( msg = f"pd.DataFrame index {type(index)} not implemented" raise NotImplementedError(msg) - attributes = { + r_attributes = converter.convert_to_r_attributes({ "names": np.array(column_names, dtype=np.dtype("U")), "class": "data.frame", "row.names": row_names, - } - return r_type, r_value, attributes + }) + + return build_r_object( + RObjectType.VEC, + value=r_value, + is_object=True, + attributes=r_attributes, + ) def rangeindex_constructor( data: pd.RangeIndex, converter: Converter, -) -> ConstructorReturnValue: +) -> RObject: """ Construct R object components from pandas rangeindex. @@ -166,15 +177,20 @@ def rangeindex_constructor( if converter.format_version < R_MINIMUM_VERSION_WITH_ALTREP: # ALTREP support is from R version 3.5.0 # (minimum version for format version 3) - return RObjectType.INT, np.array(data), None + return build_r_object( + RObjectType.INT, + value=np.array(data), + ) assert isinstance(data.step, int) if data.step != 1: # R supports compact sequences only with step 1; # convert the range to an array of values - return RObjectType.INT, np.array(data), None + return build_r_object( + RObjectType.INT, + value=np.array(data), + ) - r_type = RObjectType.ALTREP r_value = ( build_r_list([ converter.convert_to_r_sym("compact_intseq"), @@ -188,8 +204,11 @@ def rangeindex_constructor( ], dtype=float)), converter.convert_to_r_object(None), ) - attributes = None - return r_type, r_value, attributes + + return build_r_object( + RObjectType.ALTREP, + value=r_value, + ) DEFAULT_CONSTRUCTOR_DICT: Final[ConstructorDict] = MappingProxyType({ @@ -610,12 +629,10 @@ def convert_to_r_object( # noqa: C901, PLR0912, PLR0915 # Check available constructors for t, constructor in self.constructor_dict.items(): if isinstance(data, t): - r_type, r_value, attributes = constructor(data, self) - break + return constructor(data, self) - if r_type is None: - msg = f"type {type(data)} not implemented" - raise NotImplementedError(msg) + msg = f"type {type(data)} not implemented" + raise NotImplementedError(msg) if attributes is not None: is_object = "class" in attributes From ad30ca3aa64e5da292759e46debfa4300377dd4d Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Wed, 22 Jan 2025 15:17:06 +0200 Subject: [PATCH 094/100] Fix docstring --- rdata/_write.py | 6 ++++-- rdata/conversion/to_r.py | 12 ++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/rdata/_write.py b/rdata/_write.py index a534af0..b655b26 100644 --- a/rdata/_write.py +++ b/rdata/_write.py @@ -42,7 +42,8 @@ def write_rds( compression: Compression. encoding: Encoding to be used for strings within data. format_version: File format version. - constructor_dict: Dictionary mapping Python types to R classes. + constructor_dict: Dictionary mapping Python classes to + functions converting them to R classes. See Also: :func:`write_rda`: Similar function that writes an RDA or RDATA file. @@ -94,7 +95,8 @@ def write_rda( compression: Compression. encoding: Encoding to be used for strings within data. format_version: File format version. - constructor_dict: Dictionary mapping Python types to R classes. + constructor_dict: Dictionary mapping Python classes to + functions converting them to R classes. See Also: :func:`write_rds`: Similar function that writes an RDS file. diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index ee80ae6..ff24b65 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -407,7 +407,8 @@ class ConverterFromPythonToR: encoding: Encoding to be used for strings within data. format_version: File format version. r_version_serialized: R version written as the creator of the object. - constructor_dict: Dictionary mapping Python types to R classes. + constructor_dict: Dictionary mapping Python classes to + functions converting them to R classes. """ def __init__( @@ -425,7 +426,8 @@ def __init__( encoding: Encoding to be used for strings within data. format_version: File format version. r_version_serialized: R version written as the creator of the object. - constructor_dict: Dictionary mapping Python types to R classes. + constructor_dict: Dictionary mapping Python classes to + functions converting them to R classes. """ self.encoding = encoding self.format_version = format_version @@ -665,7 +667,8 @@ def convert_python_to_r_data( encoding: Encoding to be used for strings within data. format_version: File format version. r_version_serialized: R version written as the creator of the object. - constructor_dict: Dictionary mapping Python types to R classes. + constructor_dict: Dictionary mapping Python classes to + functions converting them to R classes. file_type: File type. Returns: @@ -698,7 +701,8 @@ def convert_python_to_r_object( encoding: Encoding to be used for strings within data. format_version: File format version. r_version_serialized: R version written as the creator of the object. - constructor_dict: Dictionary mapping Python types to R classes. + constructor_dict: Dictionary mapping Python classes to + functions converting them to R classes. Returns: Corresponding RObject object. From 8a4758a5935a154513809ccebc98f2c3921bb77c Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Wed, 22 Jan 2025 15:24:01 +0200 Subject: [PATCH 095/100] Do not expose DEFAULT_CONSTRUCTOR_DICT --- rdata/_write.py | 4 +++- rdata/conversion/__init__.py | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/rdata/_write.py b/rdata/_write.py index b655b26..225f76f 100644 --- a/rdata/_write.py +++ b/rdata/_write.py @@ -5,10 +5,12 @@ from typing import TYPE_CHECKING from .conversion import ( - DEFAULT_CONSTRUCTOR_DICT, DEFAULT_FORMAT_VERSION, convert_python_to_r_data, ) +from .conversion.to_r import ( + DEFAULT_CONSTRUCTOR_DICT, +) from .unparser import unparse_file if TYPE_CHECKING: diff --git a/rdata/conversion/__init__.py b/rdata/conversion/__init__.py index c5855df..e6b5909 100644 --- a/rdata/conversion/__init__.py +++ b/rdata/conversion/__init__.py @@ -26,7 +26,6 @@ ts_constructor as ts_constructor, ) from .to_r import ( - DEFAULT_CONSTRUCTOR_DICT as DEFAULT_CONSTRUCTOR_DICT, DEFAULT_FORMAT_VERSION as DEFAULT_FORMAT_VERSION, ConverterFromPythonToR as ConverterFromPythonToR, convert_python_to_r_data as convert_python_to_r_data, From 760684fbeb1f06e911409f85a4303f294a6185d7 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Wed, 22 Jan 2025 15:41:41 +0200 Subject: [PATCH 096/100] Do not expose DEFAULT_FORMAT_VERSION --- rdata/_write.py | 2 +- rdata/conversion/__init__.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/rdata/_write.py b/rdata/_write.py index 225f76f..cce2fc9 100644 --- a/rdata/_write.py +++ b/rdata/_write.py @@ -5,11 +5,11 @@ from typing import TYPE_CHECKING from .conversion import ( - DEFAULT_FORMAT_VERSION, convert_python_to_r_data, ) from .conversion.to_r import ( DEFAULT_CONSTRUCTOR_DICT, + DEFAULT_FORMAT_VERSION, ) from .unparser import unparse_file diff --git a/rdata/conversion/__init__.py b/rdata/conversion/__init__.py index e6b5909..2d5a0ec 100644 --- a/rdata/conversion/__init__.py +++ b/rdata/conversion/__init__.py @@ -26,7 +26,6 @@ ts_constructor as ts_constructor, ) from .to_r import ( - DEFAULT_FORMAT_VERSION as DEFAULT_FORMAT_VERSION, ConverterFromPythonToR as ConverterFromPythonToR, convert_python_to_r_data as convert_python_to_r_data, convert_python_to_r_object as convert_python_to_r_object, From be97c4c8254dff13f9937219e86f6d9a4991fe60 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Wed, 22 Jan 2025 15:47:56 +0200 Subject: [PATCH 097/100] Remove asserts encoded in type hints --- rdata/conversion/to_r.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index ff24b65..f3e3d54 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -80,7 +80,6 @@ def categorical_constructor( Returns: Components of the R object. """ - assert isinstance(data, pd.Categorical) r_attributes = converter.convert_to_r_attributes({ "levels": data.categories.to_numpy(), "class": "factor", @@ -108,7 +107,6 @@ def dataframe_constructor( Returns: Components of the R object. """ - assert isinstance(data, pd.DataFrame) column_names = [] r_value = [] for column, series in data.items(): @@ -173,7 +171,6 @@ def rangeindex_constructor( Returns: Components of the R object. """ - assert isinstance(data, pd.RangeIndex) if converter.format_version < R_MINIMUM_VERSION_WITH_ALTREP: # ALTREP support is from R version 3.5.0 # (minimum version for format version 3) @@ -516,8 +513,6 @@ def convert_to_r_sym( Returns: R object. """ - assert isinstance(name, str) - # Reference to existing symbol if exists if name in self._references: reference = self._references[name] From d755d089da8a91868eb8eb10f99842bc9037c94b Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Wed, 22 Jan 2025 15:56:15 +0200 Subject: [PATCH 098/100] Add comment on default row names --- rdata/conversion/to_r.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index f3e3d54..05f2b22 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -126,6 +126,7 @@ def dataframe_constructor( if isinstance(index, pd.RangeIndex): assert isinstance(index.start, int) if index.start == 1 and index.stop == data.shape[0] + 1 and index.step == 1: + # Construct default row names stored as [R_INT_NA, -len] row_names = np.ma.array( data=[R_INT_NA, -data.shape[0]], mask=[True, False], From a086138ddc57819ba8539bae9b5009a3bd6ad3df Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Wed, 22 Jan 2025 17:08:57 +0200 Subject: [PATCH 099/100] Fix ruff --- rdata/unparser/_ascii.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rdata/unparser/_ascii.py b/rdata/unparser/_ascii.py index 0881631..bab0461 100644 --- a/rdata/unparser/_ascii.py +++ b/rdata/unparser/_ascii.py @@ -93,8 +93,7 @@ def _unparse_array_values_raw( line = "-Inf" else: line = str(value) - if line.endswith(".0"): - line = line[:-2] + line = line.removesuffix(".0") else: msg = f"Unknown dtype: {array.dtype}" From e50eb9ca752d23d0f5af9d52acc8f225a45f2dfe Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 27 Jan 2025 11:00:09 +0200 Subject: [PATCH 100/100] Rename default constructors to DEFAULT_CLASS_MAP --- rdata/_write.py | 6 +++--- rdata/conversion/to_r.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/rdata/_write.py b/rdata/_write.py index cce2fc9..696546f 100644 --- a/rdata/_write.py +++ b/rdata/_write.py @@ -8,7 +8,7 @@ convert_python_to_r_data, ) from .conversion.to_r import ( - DEFAULT_CONSTRUCTOR_DICT, + DEFAULT_CLASS_MAP, DEFAULT_FORMAT_VERSION, ) from .unparser import unparse_file @@ -29,7 +29,7 @@ def write_rds( compression: Compression = "gzip", encoding: Encoding = "utf-8", format_version: int = DEFAULT_FORMAT_VERSION, - constructor_dict: ConstructorDict = DEFAULT_CONSTRUCTOR_DICT, + constructor_dict: ConstructorDict = DEFAULT_CLASS_MAP, ) -> None: """ Write an RDS file. @@ -82,7 +82,7 @@ def write_rda( compression: Compression = "gzip", encoding: Encoding = "utf-8", format_version: int = DEFAULT_FORMAT_VERSION, - constructor_dict: ConstructorDict = DEFAULT_CONSTRUCTOR_DICT, + constructor_dict: ConstructorDict = DEFAULT_CLASS_MAP, ) -> None: """ Write an RDA or RDATA file. diff --git a/rdata/conversion/to_r.py b/rdata/conversion/to_r.py index 05f2b22..9ec55d3 100644 --- a/rdata/conversion/to_r.py +++ b/rdata/conversion/to_r.py @@ -209,7 +209,7 @@ def rangeindex_constructor( ) -DEFAULT_CONSTRUCTOR_DICT: Final[ConstructorDict] = MappingProxyType({ +DEFAULT_CLASS_MAP: Final[ConstructorDict] = MappingProxyType({ pd.Categorical: categorical_constructor, pd.DataFrame: dataframe_constructor, pd.RangeIndex: rangeindex_constructor, @@ -415,7 +415,7 @@ def __init__( encoding: Encoding = "utf-8", format_version: int = DEFAULT_FORMAT_VERSION, r_version_serialized: int = DEFAULT_R_VERSION_SERIALIZED, - constructor_dict: ConstructorDict = DEFAULT_CONSTRUCTOR_DICT, + constructor_dict: ConstructorDict = DEFAULT_CLASS_MAP, ) -> None: """ Init class. @@ -652,7 +652,7 @@ def convert_python_to_r_data( encoding: Encoding = "utf-8", format_version: int = DEFAULT_FORMAT_VERSION, r_version_serialized: int = DEFAULT_R_VERSION_SERIALIZED, - constructor_dict: ConstructorDict = DEFAULT_CONSTRUCTOR_DICT, + constructor_dict: ConstructorDict = DEFAULT_CLASS_MAP, file_type: FileType = "rds", ) -> RData: """ @@ -687,7 +687,7 @@ def convert_python_to_r_object( encoding: Encoding = "utf-8", format_version: int = DEFAULT_FORMAT_VERSION, r_version_serialized: int = DEFAULT_R_VERSION_SERIALIZED, - constructor_dict: ConstructorDict = DEFAULT_CONSTRUCTOR_DICT, + constructor_dict: ConstructorDict = DEFAULT_CLASS_MAP, ) -> RObject: """ Convert Python data to R object.