From b4025553a5f11b14a0d985681f39849968e3e747 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Tue, 21 Jan 2025 10:43:12 -0900 Subject: [PATCH] feat: add to_dicts fixes https://github.com/ibis-project/ibis/issues/9185 --- ibis/backends/__init__.py | 38 ++++++++++++++++++++++++++++++ ibis/backends/tests/test_export.py | 18 ++++++++++++++ ibis/expr/types/core.py | 37 ++++++++++++++++++++++++++++- 3 files changed, 92 insertions(+), 1 deletion(-) diff --git a/ibis/backends/__init__.py b/ibis/backends/__init__.py index c77be762da1a..38a912cbf357 100644 --- a/ibis/backends/__init__.py +++ b/ibis/backends/__init__.py @@ -586,6 +586,44 @@ def to_delta( with expr.to_pyarrow_batches(params=params) as batch_reader: write_deltalake(path, batch_reader, **kwargs) + @util.experimental + def to_dicts( + self, expr: ir.Table, *, chunk_size: int = 1_000_000 + ) -> Iterable[dict[str, Any]]: + """Iterate through each row as a `dict` of column_name -> value. + + Parameters + ---------- + expr + The ibis expression to materialize as an iterable of row dictionaries. + chunk_size + We materialize the results in chunks of this size, to keep memory usage under control. + Larger values probably will be faster but consume more memory. + + Returns + ------- + Iterable[dict[str, Any]] + An iterator of dictionaries, each representing a row in the table. 
+ + Examples + -------- + >>> t = ibis.memtable({"i": [1, 2, 3], "s": ["a", "b", "c"]}) + >>> list(t.to_dicts()) + [{'i': 1, 's': 'a'}, {'i': 2, 's': 'b'}, {'i': 3, 's': 'c'}] + + Single columns are returned as dictionaries with a single key: + + >>> column = t.i + >>> list(column.to_dicts()) + [{'i': 1}, {'i': 2}, {'i': 3}] + + See Also + -------- + [`Column.to_list`](./expression-generic.qmd#ibis.expr.types.generic.Column.to_list) + """ + for batch in self.to_pyarrow_batches(expr, chunk_size=chunk_size): + yield from batch.to_pylist() + @util.experimental def to_json( self, diff --git a/ibis/backends/tests/test_export.py b/ibis/backends/tests/test_export.py index f2de3959b9fc..861feeadf7fd 100644 --- a/ibis/backends/tests/test_export.py +++ b/ibis/backends/tests/test_export.py @@ -350,6 +350,24 @@ def test_table_to_csv(tmp_path, backend, awards_players): backend.assert_frame_equal(awards_players.to_pandas(), df) +@pytest.mark.parametrize("chunk_size", [1, 1000]) +def test_to_dicts(con, chunk_size): + t = ibis.memtable({"i": [1, 2, 3], "s": ["a", "b", "c"]}) + t = con.create_table("t", t) + + result = list(t.to_dicts(chunk_size=chunk_size)) + expected = [{"i": 1, "s": "a"}, {"i": 2, "s": "b"}, {"i": 3, "s": "c"}] + assert result == expected + + result = list(t.limit(0).to_dicts(chunk_size=chunk_size)) + expected = [] + assert result == expected + + result = list(t.i.to_dicts(chunk_size=chunk_size)) + expected = [{"i": 1}, {"i": 2}, {"i": 3}] + assert result == expected + + @pytest.mark.notimpl( [ "athena", diff --git a/ibis/expr/types/core.py b/ibis/expr/types/core.py index 359cbcd83f70..30abded4c82c 100644 --- a/ibis/expr/types/core.py +++ b/ibis/expr/types/core.py @@ -20,7 +20,7 @@ from ibis.util import experimental if TYPE_CHECKING: - from collections.abc import Iterator, Mapping + from collections.abc import Iterable, Iterator, Mapping from pathlib import Path import pandas as pd @@ -771,6 +771,41 @@ def to_delta( """ 
self._find_backend(use_default=True).to_delta(self, path, **kwargs) + @experimental + def to_dicts(self, *, chunk_size: int = 1_000_000) -> Iterable[dict[str, Any]]: + """Iterate through each row as a `dict` of column_name -> value. + + Parameters + ---------- + chunk_size + We materialize the results in chunks of this size, to keep memory usage under control. + Larger values probably will be faster but consume more memory. + + Returns + ------- + Iterable[dict[str, Any]] + An iterator of dictionaries, each representing a row in the table. + + Examples + -------- + >>> t = ibis.memtable({"i": [1, 2, 3], "s": ["a", "b", "c"]}) + >>> list(t.to_dicts()) + [{'i': 1, 's': 'a'}, {'i': 2, 's': 'b'}, {'i': 3, 's': 'c'}] + + Single columns are returned as dictionaries with a single key: + + >>> column = t.i + >>> list(column.to_dicts()) + [{'i': 1}, {'i': 2}, {'i': 3}] + + See Also + -------- + [`Column.to_list`](./expression-generic.qmd#ibis.expr.types.generic.Column.to_list) + """ + return self._find_backend(use_default=True).to_dicts( + self, chunk_size=chunk_size + ) + @experimental def to_json( self,