Skip to content

Commit

Permalink
Merge pull request #32 from trossi/faster-xdr-reader
Browse files Browse the repository at this point in the history
Faster xdr reader
  • Loading branch information
vnmabus authored Jan 16, 2024
2 parents d05024a + ad85c61 commit f81ed84
Show file tree
Hide file tree
Showing 4 changed files with 112 additions and 120 deletions.
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2018 Carlos Ramos Carreño
Copyright (c) 2018 Rdata developers.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ keywords = [
"r",
"dataset",
]
authors = [
{name = "Carlos Ramos Carreño", email = "[email protected]"},
{name = "Tuomas Rossi", email = "[email protected]"},
]
maintainers = [
{name = "Carlos Ramos Carreño", email = "[email protected]"},
]
Expand Down
174 changes: 55 additions & 119 deletions rdata/parser/_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import os
import pathlib
import warnings
import xdrlib
from collections.abc import Iterator
from dataclasses import dataclass
from types import MappingProxyType
Expand All @@ -20,12 +19,12 @@
Optional,
Protocol,
Sequence,
TypeVar,
Union,
runtime_checkable,
)

import numpy as np
import numpy.typing as npt

R_INT_NA = -2**31 # noqa: WPS432
"""Value used to represent a missing integer in R."""
Expand Down Expand Up @@ -529,36 +528,64 @@ def __init__(
self.expand_altrep = expand_altrep
self.altrep_constructor_dict = altrep_constructor_dict

def _parse_array(
self,
dtype: np.dtype,
) -> npt.NDArray[Any]:
"""Parse an array composed of an integer (array size) and values."""
length = self.parse_int()
return self._parse_array_values(dtype, length)

@abc.abstractmethod
def _parse_array_values(
self,
dtype: np.dtype,
length: int,
) -> npt.NDArray[Any]:
"""Parse values of an array."""
pass

def parse_bool(self) -> bool:
    """Read one integer and return its truth value."""
    raw_value = self.parse_int()
    return bool(raw_value)

def parse_nullable_bool(self) -> bool | None:
    """Read a boolean that may be missing; a missing value gives ``None``."""
    raw_value = self.parse_nullable_int()
    return bool(raw_value) if raw_value is not None else None

@abc.abstractmethod
def parse_int(self) -> int:
"""Parse an integer."""
pass
return int(self._parse_array_values(np.int32, 1)[0])

def parse_nullable_bool_array(
    self,
    fill_value: bool = True,
) -> npt.NDArray[np.bool_] | np.ma.MaskedArray[Any, Any]:
    """
    Read a boolean array that may contain missing values.

    The values are read through ``parse_nullable_int_array`` (which
    receives ``fill_value``) and the result is converted to booleans.
    """
    int_values = self.parse_nullable_int_array(fill_value)
    bool_values = int_values.astype(np.bool_)
    return bool_values

def parse_nullable_int(self) -> int | None: # noqa: D102
result = self.parse_int()
def parse_nullable_int_array(
self,
fill_value: int = R_INT_NA,
) -> npt.NDArray[np.int32] | np.ma.MaskedArray[Any, Any]:
"""Parse an integer array."""

return None if result == R_INT_NA else result
data = self._parse_array(np.int32)
mask = (data == R_INT_NA)
data[mask] = fill_value

@abc.abstractmethod
def parse_double(self) -> float:
"""Parse a double."""
pass
if np.any(mask):
return np.ma.array( # type: ignore
data=data,
mask=mask,
fill_value=fill_value,
)

return data

def parse_complex(self) -> complex:
    """Read two consecutive doubles as the real and imaginary parts."""
    real_part = self.parse_double()
    imaginary_part = self.parse_double()
    return complex(real_part, imaginary_part)
def parse_double_array(self) -> npt.NDArray[np.float64]:
    """Read a length-prefixed array of 64-bit floating point values."""
    double_values = self._parse_array(np.float64)
    return double_values

def parse_complex_array(self) -> npt.NDArray[np.complex128]:
    """Read a length-prefixed array of 128-bit complex values."""
    complex_values = self._parse_array(np.complex128)
    return complex_values

@abc.abstractmethod
def parse_string(self, length: int) -> bytes:
Expand Down Expand Up @@ -661,37 +688,6 @@ def _parse_bytecode(

return (code, constants)

T = TypeVar("T")

def _parse_nullable_array(
self,
dtype: type[T],
parse_function: Callable[[], T | None],
fill_value: T,
) -> np.ndarray[Any, Any] | np.ma.MaskedArray[Any, Any]:

length = self.parse_int()

value = np.empty(length, dtype=dtype)
mask = np.zeros(length, dtype=np.bool_)

for i in range(length):
parsed = parse_function()
if parsed is None:
mask[i] = True
value[i] = fill_value
else:
value[i] = parsed

if np.any(mask):
return np.ma.MaskedArray(
data=value,
mask=mask,
fill_value=fill_value,
)

return value

def parse_R_object(
self,
reference_list: list[RObject] | None = None,
Expand Down Expand Up @@ -832,34 +828,16 @@ def parse_R_object(
)

elif info.type == RObjectType.LGL:
value = self._parse_nullable_array(
dtype=np.bool_,
parse_function=self.parse_nullable_bool,
fill_value=True,
)
value = self.parse_nullable_bool_array()

elif info.type == RObjectType.INT:
value = self._parse_nullable_array(
dtype=np.int32,
parse_function=self.parse_nullable_int,
fill_value=R_INT_NA,
)
value = self.parse_nullable_int_array()

elif info.type == RObjectType.REAL:
length = self.parse_int()

value = np.empty(length, dtype=np.double)

for i in range(length):
value[i] = self.parse_double()
value = self.parse_double_array()

elif info.type == RObjectType.CPLX:
length = self.parse_int()

value = np.empty(length, dtype=np.complex_)

for i in range(length):
value[i] = self.parse_complex()
value = self.parse_complex_array()

elif info.type in {
RObjectType.STR,
Expand Down Expand Up @@ -988,50 +966,6 @@ def parse_R_object(
return result


class ParserXDR(Parser):
    """Parser for input whose integers and doubles are XDR-encoded."""

    def __init__(
        self,
        data: memoryview,
        position: int = 0,
        *,
        expand_altrep: bool = True,
        altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP,
    ) -> None:
        """Store the buffer, the read offset and an XDR unpacker over it."""
        super().__init__(
            expand_altrep=expand_altrep,
            altrep_constructor_dict=altrep_constructor_dict,
        )
        self.data = data
        self.position = position
        self.xdr_parser = xdrlib.Unpacker(data)

    def parse_int(self) -> int:  # noqa: D102
        # ``self.position`` is the authoritative offset (``parse_string``
        # advances it without touching the unpacker), so the unpacker is
        # re-positioned before every read and read back afterwards.
        unpacker = self.xdr_parser
        unpacker.set_position(self.position)
        number = unpacker.unpack_int()
        self.position = unpacker.get_position()

        return number

    def parse_double(self) -> float:  # noqa: D102
        # Same position hand-over as in ``parse_int``.
        unpacker = self.xdr_parser
        unpacker.set_position(self.position)
        number = unpacker.unpack_double()
        self.position = unpacker.get_position()

        return number

    def parse_string(self, length: int) -> bytes:  # noqa: D102
        start = self.position
        chunk = self.data[start:(start + length)]
        self.position = start + length
        return bytes(chunk)

    def parse_all(self) -> RData:
        """Parse the whole buffer and check that it was fully consumed."""
        parsed = super().parse_all()
        assert self.position == len(self.data)
        return parsed


def parse_file(
file_or_path: AcceptableFile | os.PathLike[Any] | Traversable | str,
*,
Expand Down Expand Up @@ -1284,6 +1218,8 @@ def parse_rdata_binary(
data = data[len(format_dict[format_type]):]

if format_type is RdataFormats.XDR:
from ._xdr import ParserXDR

parser = ParserXDR(
data,
expand_altrep=expand_altrep,
Expand Down
52 changes: 52 additions & 0 deletions rdata/parser/_xdr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from __future__ import annotations

import io
import numpy as np
import numpy.typing as npt

from typing import (
Any,
)

from ._parser import (
AltRepConstructorMap,
DEFAULT_ALTREP_MAP,
Parser,
RData,
)


class ParserXDR(Parser):
    """
    Parser for data in XDR format.

    Numeric values are stored in big-endian byte order. The input is wrapped
    in an in-memory binary stream that is consumed sequentially.
    """

    def __init__(
        self,
        data: memoryview,
        *,
        expand_altrep: bool = True,
        altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP,
    ) -> None:
        """
        Create a parser reading from ``data``.

        Args:
            data: Buffer with the XDR-encoded contents.
            expand_altrep: Forwarded unchanged to ``Parser``.
            altrep_constructor_dict: Forwarded unchanged to ``Parser``.

        """
        super().__init__(
            expand_altrep=expand_altrep,
            altrep_constructor_dict=altrep_constructor_dict,
        )
        self.file = io.BytesIO(data)

    def _parse_array_values(
        self,
        dtype: np.dtype,
        length: int,
    ) -> npt.NDArray[Any]:
        """
        Read ``length`` big-endian values of type ``dtype``.

        Args:
            dtype: Element type (anything accepted by ``np.dtype``).
            length: Number of elements to read.

        Returns:
            A new, writable array in native byte order.

        Raises:
            ValueError: If ``length`` is negative or the stream ends before
                all the requested bytes could be read.

        """
        dtype = np.dtype(dtype)

        if length < 0:
            # A negative size makes ``BytesIO.read`` return everything that
            # is left, so a corrupt length must be rejected before reading.
            raise ValueError(f"Invalid negative array length: {length}")

        expected_size = length * dtype.itemsize
        buffer = self.file.read(expected_size)
        if len(buffer) != expected_size:
            raise ValueError(
                f"Unexpected end of data: expected {expected_size} bytes, "
                f"got {len(buffer)}",
            )

        # Read in big-endian order and convert to native byte order.
        # The conversion copies on purpose: ``np.frombuffer`` over ``bytes``
        # gives a read-only array, and on big-endian hosts a no-copy
        # conversion would return that read-only array to callers that
        # modify the values in place.
        return np.frombuffer(
            buffer,
            dtype=dtype.newbyteorder(">"),
        ).astype(dtype)

    def parse_string(self, length: int) -> bytes:
        """
        Read ``length`` raw bytes from the stream.

        Raises:
            ValueError: If ``length`` is non-negative and fewer than
                ``length`` bytes remain in the stream.

        """
        result = self.file.read(length)
        # Negative lengths keep the underlying ``read`` semantics; only a
        # short read for an explicit byte count is reported as truncation.
        if length >= 0 and len(result) != length:
            raise ValueError(
                f"Unexpected end of data: expected {length} bytes, "
                f"got {len(result)}",
            )
        return result

    def parse_all(self) -> RData:
        """Parse the whole stream and check that nothing is left unread."""
        rdata = super().parse_all()
        # Check that there is no more data in the file
        assert self.file.read(1) == b'', "Unparsed data left in the stream"
        return rdata

0 comments on commit f81ed84

Please sign in to comment.