WIP: Castra partitioning rework #37

Open · wants to merge 3 commits into master
100 changes: 100 additions & 0 deletions castra/bloscpack_ext.py
@@ -0,0 +1,100 @@
from io import BytesIO

import numpy as np
import blosc
from pandas.msgpack import Packer, unpackb, packb  # pandas' vendored msgpack
from bloscpack.abstract_io import PlainSink, pack, unpack
from bloscpack.args import calculate_nchunks, BloscArgs
from bloscpack.numpy_io import PlainNumpySource
from bloscpack.file_io import (CompressedFPSink, CompressedFPSource,
_read_metadata)
from bloscpack.append import append_fp, _rewrite_metadata_fp


pack_array_header = Packer().pack_array_header


class ObjectSeriesSink(PlainSink):
    """Decompress chunks into a buffer laid out as one msgpack'd list."""
    def __init__(self, metadata, encoding='utf8'):
        self.metadata = metadata
        self.encoding = encoding
        length = metadata['length']
        nbytes = metadata['nbytes']
        # Prepend a fresh msgpack array header for the total element count;
        # the raw element bytes are decompressed in after it.
        header = pack_array_header(length)
        head_size = len(header)
        self.buffer = np.empty(nbytes + head_size, 'c')
        self.buffer[:head_size] = header
        self.ptr = self.buffer[head_size:].__array_interface__['data'][0]

    def put(self, compressed):
        # Decompress directly into the buffer and advance the write pointer.
        bwritten = blosc.decompress_ptr(compressed, self.ptr)
        self.ptr += bwritten

    @property
    def ndarray(self):
        data = unpackb(self.buffer.tobytes(), encoding=self.encoding)
        return np.array(data, object, copy=False)


class ObjectSeriesSource(PlainNumpySource):
    """Expose a msgpack'd object series, minus its array header, for packing."""
    def __init__(self, series, encoding='utf8'):
        length = len(series)
        head_size = len(pack_array_header(length))
        # Keep only the element payload; the header is rebuilt on unpack,
        # which lets appends extend the element stream in place.
        self.ndarray = np.fromstring(packb(series.tolist(),
                                           encoding=encoding), 'c')[head_size:]
        self.size = len(self.ndarray)
        self.metadata = {u'length': length, u'nbytes': self.size}
        self.ptr = self.ndarray.__array_interface__['data'][0]


def pack_object_series(series, sink, encoding='utf8', chunk_size='1M',
                       blosc_args=None, bloscpack_args=None,
                       metadata_args=None):
    # msgpack output is a plain byte stream, so force a blosc typesize of 1.
    if blosc_args is None:
        blosc_args = BloscArgs(typesize=1)
    else:
        blosc_args.typesize = 1
    source = ObjectSeriesSource(series, encoding=encoding)
    nchunks, chunk_size, last_chunk_size = \
        calculate_nchunks(source.size, chunk_size)
    pack(source, sink, nchunks, chunk_size, last_chunk_size,
         metadata=source.metadata, blosc_args=blosc_args,
         bloscpack_args=bloscpack_args, metadata_args=metadata_args)


def pack_object_series_file(series, fn, encoding='utf8', chunk_size='1M',
blosc_args=None, bloscpack_args=None,
metadata_args=None):
with open(fn, 'wb') as fp:
sink = CompressedFPSink(fp)
pack_object_series(series, sink, chunk_size=chunk_size,
encoding=encoding, blosc_args=blosc_args,
bloscpack_args=bloscpack_args,
metadata_args=metadata_args)


def append_object_series_file(series, fn, encoding='utf8'):
    length = len(series)
    head_size = len(pack_array_header(length))
    # Strip the msgpack array header: only raw elements get appended.
    data = packb(series.tolist(), encoding=encoding)[head_size:]
    nbytes = len(data)
    with open(fn, 'rb+') as fil:
        append_fp(fil, BytesIO(data), nbytes)
        # The 32-byte bloscpack magic header precedes the metadata section.
        fil.seek(32)
        meta = _read_metadata(fil)[0]
        meta[u'length'] += length
        meta[u'nbytes'] += nbytes
        fil.seek(32)
        _rewrite_metadata_fp(fil, meta)


def unpack_object_series(source, encoding='utf8'):
sink = ObjectSeriesSink(source.metadata, encoding)
unpack(source, sink)
return sink.ndarray


def unpack_object_series_file(fn, encoding='utf8'):
with open(fn, 'rb') as fil:
source = CompressedFPSource(fil)
return unpack_object_series(source, encoding)
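
For reviewers, a minimal round-trip of the new module's API; the file name and sample data below are hypothetical, not part of the diff:

import pandas as pd

from castra.bloscpack_ext import (pack_object_series_file,
                                  append_object_series_file,
                                  unpack_object_series_file)

s = pd.Series(['foo', 'bar', 'baz'])
pack_object_series_file(s, 'names.blp')                      # msgpack + bloscpack
append_object_series_file(pd.Series(['quux']), 'names.blp')  # extend in place
result = unpack_object_series_file('names.blp')              # -> object ndarray
assert list(result) == ['foo', 'bar', 'baz', 'quux']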
23 changes: 9 additions & 14 deletions castra/core.py
@@ -15,13 +15,12 @@

 from functools import partial

-import blosc
 import bloscpack

 import numpy as np
 import pandas as pd

-from pandas import msgpack
+from .bloscpack_ext import pack_object_series_file, unpack_object_series_file


 bp_args = bloscpack.BloscpackArgs(offsets=False, checksum='None')
@@ -157,7 +156,7 @@ def extend(self, df):
             df = df.copy()
             start = self.partitions.index[-1] + 1
             new_index = pd.Index(np.arange(start, start + len(df)),
-                                 name = df.index.name)
+                                 name=df.index.name)
             df.index = new_index
         else:
             raise ValueError("Index of new dataframe less than known data")
@@ -277,7 +276,7 @@ def to_dask(self, columns=None):
 def pack_file(x, fn, encoding='utf8'):
     """ Pack numpy array into filename

-    Supports binary data with bloscpack and text data with msgpack+blosc
+    Supports binary data with bloscpack and text data with msgpack+bloscpack

     >>> pack_file(np.array([1, 2, 3]), 'foo.blp') # doctest: +SKIP

@@ -286,17 +285,16 @@ def pack_file(x, fn, encoding='utf8'):
     """
     if x.dtype != 'O':
         bloscpack.pack_ndarray_file(x, fn, bloscpack_args=bp_args,
-                                   blosc_args=blosc_args(x.dtype))
+                                    blosc_args=blosc_args(x.dtype))
     else:
-        bytes = blosc.compress(msgpack.packb(x.tolist(), encoding=encoding), 1)
-        with open(fn, 'wb') as f:
-            f.write(bytes)
+        pack_object_series_file(x, fn, encoding, bloscpack_args=bp_args,
+                                blosc_args=blosc_args(x.dtype))


 def unpack_file(fn, encoding='utf8'):
     """ Unpack numpy array from filename

-    Supports binary data with bloscpack and text data with msgpack+blosc
+    Supports binary data with bloscpack and text data with msgpack+bloscpack

     >>> unpack_file('foo.blp') # doctest: +SKIP
     array([1, 2, 3])
@@ -306,11 +304,8 @@ def unpack_file(fn, encoding='utf8'):
     """
     try:
         return bloscpack.unpack_ndarray_file(fn)
-    except ValueError:
-        with open(fn, 'rb') as f:
-            data = msgpack.unpackb(blosc.decompress(f.read()),
-                                   encoding=encoding)
-        return np.array(data, object, copy=False)
+    except KeyError:
+        return unpack_object_series_file(fn, encoding)


 def coerce_index(dt, o):
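
With these changes, pack_file and unpack_file dispatch purely on dtype: non-object arrays still go through bloscpack.pack_ndarray_file, while object (text) arrays take the new msgpack+bloscpack path. A sketch of the intended behaviour, with hypothetical file names; the fallback now catches KeyError, presumably because unpack_ndarray_file, pointed at an object-series file, fails to find the numpy-specific metadata keys rather than failing to decompress:

import numpy as np

from castra.core import pack_file, unpack_file

pack_file(np.array([1, 2, 3]), 'ints.blp')              # binary path
pack_file(np.array(['a', 'b'], dtype='O'), 'strs.blp')  # object path

unpack_file('ints.blp')   # array([1, 2, 3])
unpack_file('strs.blp')   # unpack_ndarray_file raises KeyError, so this
                          # falls back to unpack_object_series_file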