Replace the per-call escaping loop in _unparse_string_characters with a
lookup in BYTE_TO_STR, a 256-entry byte-to-string table built once at import
by build_byte_to_str_map().

NOTE(review): line breaks and indentation of this patch were reconstructed
from a whitespace-collapsed copy; if a hunk is rejected on whitespace, retry
with `git apply --ignore-whitespace`.

diff --git a/rdata/unparser/_ascii.py b/rdata/unparser/_ascii.py
index 5aba32e..efb7b66 100644
--- a/rdata/unparser/_ascii.py
+++ b/rdata/unparser/_ascii.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 import string
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING
 
 import numpy as np
 
@@ -11,10 +11,41 @@
 
 if TYPE_CHECKING:
     import io
+    from typing import Any, Final
 
     import numpy.typing as npt
 
 
+def build_byte_to_str_map() -> tuple[str, ...]:
+    """Build byte-to-string mapping for string conversion."""
+
+    def escape(b: bytes) -> str:
+        r"""Escape string, e.g., b'\n' -> r'\n'."""
+        return b.decode("latin1").encode("unicode_escape").decode("ascii")
+
+    # Fill mapping with octal codes
+    byte_to_str = [rf"\{byte:03o}" for byte in range(256)]
+
+    # Update mapping for ascii characters
+    for byte in string.printable.encode("ascii"):
+        # Note: indexing bytestring yields ints
+        assert isinstance(byte, int)
+        byte_to_str[byte] = escape(bytes([byte]))
+
+    # Update mapping for special characters
+    byte_to_str[b'"'[0]] = r'\"'
+    byte_to_str[b"'"[0]] = r"\'"
+    byte_to_str[b"?"[0]] = r"\?"
+    byte_to_str[b" "[0]] = r"\040"
+    byte_to_str[b"\v"[0]] = r"\v"
+    byte_to_str[b"\f"[0]] = r"\f"
+
+    return tuple(byte_to_str)
+
+
+BYTE_TO_STR: Final = build_byte_to_str_map()
+
+
 class UnparserASCII(Unparser):
     """Unparser for files in ASCII format."""
 
@@ -73,45 +104,9 @@ def _unparse_string_characters(self, value: bytes) -> None:
         # i.e., output = value.decode('latin1').encode('unicode_escape').decode('ascii')
         # This would produce byte representation in hex such as '\xc3\xa4',
         # but we need to have the equivalent octal presentation '\303\244'.
-        # So, we need to do somewhat manual conversion instead.
-
-        # List of ascii characters that are written directly;
-        # this is all printable ascii except
-        # - ' ' that Python writes as ' ', but R as '\040'
-        # - '\v' that Python writes as '\x0b', but R as '\v'
-        # - '\f' that Python writes as '\x0c', but R as '\f'
-        write_raw = string.printable.replace(" ", "")\
-                                    .replace("\v", "")\
-                                    .replace("\f", "")
-
-        def escape(b: bytes) -> str:
-            r"""Escape string, e.g., b'\n' -> r'\\n'."""
-            return b.decode("latin1").encode("unicode_escape").decode("ascii")
-
-        # Go though the string byte-by-byte as we need to
-        # convert every non-ascii character separately
-        output = ""
-        ascii_buffer = b""
-        for byte in value:
-            if chr(byte) in write_raw:
-                # Collect ascii characters to substring buffer
-                ascii_buffer += bytes([byte])
-            else:
-                # Encountered a non-ascii character!
-                # Escape and add the ascii buffer
-                output += escape(ascii_buffer)
-                ascii_buffer = b""
-                # Add '\v' or '\f' or non-ascii character in octal presentation
-                if chr(byte) == "\v":
-                    output += r"\v"
-                elif chr(byte) == "\f":
-                    output += r"\f"
-                else:
-                    output += rf"\{byte:03o}"
-        # Escape and add the remaining ascii buffer
-        output += escape(ascii_buffer)
+        # In addition, some ascii characters need to be escaped.
 
-        # Escape some more characters like R does
-        output = output.replace('"', r'\"').replace("'", r"\'").replace("?", r"\?")
+        # Convert string byte-by-byte
+        output = "".join(BYTE_TO_STR[byte] for byte in value)
 
         self._add_line(output)