@util.experimental
def to_dicts(
    self, expr: ir.Table, *, chunk_size: int = 1_000_000
) -> Iterable[dict[str, Any]]:
    """Iterate through each row as a `dict` of column_name -> value.

    This is a generator: nothing is executed until the first row is
    requested, and results are pulled from `to_pyarrow_batches` one batch
    at a time. The underlying batch reader is closed once iteration
    finishes, raises, or the generator is closed before being exhausted.

    Parameters
    ----------
    expr
        The ibis expression to materialize as an iterable of row dictionaries.
    chunk_size
        We materialize the results in chunks of this size, to keep memory usage under control.
        Larger values probably will be faster but consume more memory.

    Returns
    -------
    Iterable[dict[str, Any]]
        An iterator of dictionaries, each representing a row in the table.

    Examples
    --------
    >>> t = ibis.memtable({"i": [1, 2, 3], "s": ["a", "b", "c"]})
    >>> list(t.to_dicts())
    [{'i': 1, 's': 'a'}, {'i': 2, 's': 'b'}, {'i': 3, 's': 'c'}]

    Single Columns are returned as dictionaries with a single key:

    >>> column = t.i
    >>> list(column.to_dicts())
    [{'i': 1}, {'i': 2}, {'i': 3}]

    See Also
    --------
    [`Column.to_list`](./expression-generic.qmd#ibis.expr.types.generic.Column.to_list)
    """
    # Hold the reader in a `with` block, the same way `to_delta` does, so it
    # is released deterministically instead of whenever it happens to be
    # garbage collected (e.g. when the caller stops iterating early).
    with self.to_pyarrow_batches(expr, chunk_size=chunk_size) as batch_reader:
        for batch in batch_reader:
            # `to_pylist` turns one record batch into a list of row dicts.
            yield from batch.to_pylist()
@experimental
def to_dicts(self, *, chunk_size: int = 1_000_000) -> Iterable[dict[str, Any]]:
    """Iterate through each row as a `dict` of column_name -> value.

    The work is delegated to the `to_dicts` method of the backend found
    for this expression (falling back to the default backend).

    Parameters
    ----------
    chunk_size
        Number of rows materialized at a time. This bounds memory usage;
        larger values probably will be faster but consume more memory.

    Returns
    -------
    Iterable[dict[str, Any]]
        An iterator of dictionaries, each representing a row in the table.

    Examples
    --------
    >>> t = ibis.memtable({"i": [1, 2, 3], "s": ["a", "b", "c"]})
    >>> list(t.to_dicts())
    [{'i': 1, 's': 'a'}, {'i': 2, 's': 'b'}, {'i': 3, 's': 'c'}]

    Single Columns are returned as dictionaries with a single key:

    >>> column = t.i
    >>> list(column.to_dicts())
    [{'i': 1}, {'i': 2}, {'i': 3}]

    See Also
    --------
    [`Column.to_list`](./expression-generic.qmd#ibis.expr.types.generic.Column.to_list)
    """
    # Resolve the backend first, then hand the expression over to it.
    backend = self._find_backend(use_default=True)
    return backend.to_dicts(self, chunk_size=chunk_size)
test_to_dicts(chunk_size, awards_players): t = (