sparktk.frame.ops.inspect module
# vim: set encoding=utf-8
# Copyright (c) 2016 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import sparktk.dtypes as dtypes
import numpy as np
spaces_between_cols = 2 # consts
ellipses = '...'
class InspectSettings(object):
    """Global settings for the 'inspect' method

    Every setting is a validated property.  Assigning None to a property restores
    that setting's default value; an invalid value raises ValueError.

    wrap: int or 'stripes'
        If set to 'stripes' then inspect prints rows in stripes; if set to an integer N,
        rows will be printed in clumps of N rows, where the columns are wrapped
    truncate: int
        If set to integer N, all strings will be truncated to length N, including a tagged ellipses
    round: int
        If set to integer N, all floating point numbers will be rounded and truncated to N digits
    width: int
        If set to integer N, the print out will try to honor a max line width of N
    margin: int (only has meaning in 'stripes' mode)
        If set to integer N, the margin for printing names in a stripe will be limited to N characters
    with_types: bool
        If set to True, header will include the data_type of each column
    """

    _unspecified = 'inspect_settings'  # sentinel: "argument not given" (None already means "use default")

    _default_wrap = 20
    _default_truncate = None
    _default_round = None
    _default_width = 80
    _default_margin = None
    _default_with_types = False

    def __init__(self, wrap=None, truncate=None, round=None, width=None, margin=None, with_types=None):
        # each public assignment goes through the property setter, which validates
        # the value and substitutes the default when it is None
        self._wrap = None
        self.wrap = wrap
        self._truncate = None
        self.truncate = truncate
        self._round = None
        self.round = round
        self._width = None
        self.width = width
        self._margin = None
        self.margin = margin
        self._with_types = None
        self.with_types = with_types

    def reset(self):
        """returns all the settings to their default values"""
        # the setters use None to restore to default
        self.wrap = None
        self.truncate = None
        self.round = None
        self.width = None
        self.margin = None
        self.with_types = None

    def copy(self,
             wrap=_unspecified,
             truncate=_unspecified,
             round=_unspecified,
             width=_unspecified,
             margin=_unspecified,
             with_types=_unspecified):
        """create a copy of this settings object and override any values specified

        An argument left at the sentinel keeps this object's current value; any other
        value (including None, which selects the default) goes through the setter.
        """
        c = InspectSettings(self.wrap, self.truncate, self.round, self.width, self.margin, self.with_types)
        if wrap is not self._unspecified:
            c.wrap = wrap
        if truncate is not self._unspecified:
            c.truncate = truncate
        if round is not self._unspecified:
            c.round = round
        if width is not self._unspecified:
            c.width = width
        if margin is not self._unspecified:
            c.margin = margin
        if with_types is not self._unspecified:
            c.with_types = with_types
        return c

    def __repr__(self):
        """displays current settings"""
        return """wrap %8s
truncate %8s
round %8s
width %8s
margin %8s
with_types %8s""" % (self.wrap, self.truncate, self.round, self.width, self.margin, self.with_types)

    @property
    def wrap(self):
        """
        If set to 'stripes' then inspect prints rows in stripes; if set to an integer N,
        rows will be printed in clumps of N rows, where the columns are wrapped
        """
        return self._wrap

    @wrap.setter
    def wrap(self, value):
        supported_strings = ['stripes']
        if value is None:
            value = self._default_wrap
        if not isinstance(value, (basestring, int, long)) or \
                (isinstance(value, basestring) and value not in supported_strings) or \
                (isinstance(value, (int, long)) and value <= 0):
            raise ValueError("Bad value %s. wrap must be a integer > 0 or one of the following strings: %s" %
                             (value, ", ".join(supported_strings)))
        self._wrap = value

    @property
    def truncate(self):
        """If set to integer N, all strings will be truncated to length N, including a tagged ellipses"""
        return self._truncate

    @truncate.setter
    def truncate(self, value):
        if value is None:
            value = self._default_truncate
        if value is not None and (not isinstance(value, (int, long)) or value <= 0):
            raise ValueError("Bad value %s. truncate must be a integer > 0" % (value,))
        self._truncate = value

    @property
    def round(self):
        """If set to integer N, all floating point numbers will be rounded and truncated to N digits"""
        return self._round

    @round.setter
    def round(self, value):
        if value is None:
            value = self._default_round
        if value is not None and (not isinstance(value, (int, long)) or value < 0):
            raise ValueError("Bad value %s. round must be an integer >= 0" % (value,))
        self._round = value

    @property
    def width(self):
        """If set to integer N, the print out will try to honor a max line width of N"""
        return self._width

    @width.setter
    def width(self, value):
        if value is None:
            value = self._default_width
        if not isinstance(value, (int, long)) or value <= 0:
            # zero is rejected, so the message must say "> 0"
            raise ValueError("Bad value %s. width must be an integer > 0" % (value,))
        self._width = value

    @property
    def margin(self):
        """
        (only has meaning in wrap='stripes' mode)
        If set to integer N, the margin for printing names in a stripe will be limited to N characters
        """
        return self._margin

    @margin.setter
    def margin(self, value):
        if value is None:
            value = self._default_margin
        if value is not None and (not isinstance(value, (int, long)) or value <= 0):
            # zero is rejected, so the message must say "> 0"
            raise ValueError("Bad value %s. margin must be an integer > 0" % (value,))
        self._margin = value

    @property
    def with_types(self):
        """If set to True, header will include the data_type of each column"""
        return self._with_types

    @with_types.setter
    def with_types(self, value):
        if value is None:
            value = self._default_with_types
        if not isinstance(value, bool):
            raise ValueError("Bad value %s. with_types must be a bool" % (value,))
        self._with_types = value
inspect_settings = InspectSettings()
class RowsInspection(object):
    """
    class used specifically for inspect, where the __repr__ is the main use case

    Holds a set of rows, their schema and the formatting options, and renders them
    either in 'clumps' (wrap is an integer) or in 'stripes' (wrap == 'stripes').
    """

    def __init__(self, rows, schema, offset, format_settings=inspect_settings):
        """
        :param rows: sequence of rows; each row is an indexable/sliceable sequence of values
        :param schema: sequence of (name, data_type) pairs, one per column
        :param offset: number added to each row's position to produce the printed row index
        :param format_settings: (InspectSettings) formatting options; TypeError if any other type
        """
        if not isinstance(format_settings, InspectSettings):
            raise TypeError("argument format_settings must be type %s" % InspectSettings)
        if format_settings.wrap == 'stripes':
            self._repr = self._repr_stripes
        else:
            # never use a clump size larger than the number of rows on hand
            self.wrap = min(format_settings.wrap, len(rows)) or len(rows)
            self._repr = self._repr_wrap
        self.rows = rows
        self.schema = schema
        self.offset = offset
        self.truncate = format_settings.truncate
        self.round = format_settings.round
        self.width = format_settings.width
        self.margin = format_settings.margin
        self.with_types = format_settings.with_types
        # must come last: the formatters are chosen from self.round and self.truncate
        self.value_formatters = [self._get_value_formatter(data_type) for name, data_type in schema]

    def __repr__(self):
        return self._repr()

    def _repr_wrap(self):
        """print rows in a 'clumps' style"""
        row_index_str_format = '[%s]' + ' ' * spaces_between_cols

        def _get_row_index_str(index):
            return row_index_str_format % index

        row_count = len(self.rows)
        row_clump_count = _get_row_clump_count(row_count, self.wrap)
        header_sizes = _get_header_entry_sizes(self.schema, self.with_types)
        column_spacer = ' ' * spaces_between_cols
        lines_list = []
        extra_tuples = []  # continuation lines of multi-line values, filled in by _get_wrap_entry

        for row_clump_index in xrange(row_clump_count):
            if row_clump_index > 0:
                lines_list.append('')  # extra line for new clump
            start_row_index = row_clump_index * self.wrap
            stop_row_index = start_row_index + self.wrap
            if stop_row_index > row_count:
                stop_row_index = row_count
            # the index column is as wide as the largest row index printed in this clump
            row_index_header = _get_row_index_str('#' * len(str(self.offset+stop_row_index-1)))
            margin = len(row_index_header)
            col_sizes = _get_col_sizes(self.rows, start_row_index, self.wrap, header_sizes, self.value_formatters)

            col_index = 0
            while col_index < len(self.schema):
                num_cols = _get_num_cols(self.schema, self.width, col_index, col_sizes, margin)
                if num_cols == 0:
                    raise RuntimeError("Internal error, num_cols == 0")  # sanity check on algo
                header_entries = [
                    pad_right(_get_header_entry(name, data_type, self.with_types),
                              min(self.width - margin, col_sizes[col_index+i]))
                    for i, (name, data_type) in enumerate(self.schema[col_index:col_index+num_cols])]
                header_line = row_index_header + column_spacer.join(header_entries)
                thick_line = "=" * len(header_line)
                lines_list.extend(["", header_line, thick_line])
                if row_count:
                    for row_index in xrange(start_row_index, stop_row_index):
                        row_entries = [
                            self._get_wrap_entry(data,
                                                 col_sizes[col_index+i],
                                                 self.value_formatters[col_index+i],
                                                 i,
                                                 extra_tuples)
                            for i, data in enumerate(self.rows[row_index][col_index:col_index+num_cols])]
                        new_line = pad_right(_get_row_index_str(self.offset+row_index), margin) + \
                            column_spacer.join(row_entries)
                        lines_list.append(new_line.rstrip())
                        if extra_tuples:
                            # emit the remaining lines of any multi-line values in this row
                            lines_list.extend(_get_lines_from_extra_tuples(extra_tuples,
                                                                           col_sizes[col_index:col_index+num_cols],
                                                                           margin))
                col_index += num_cols

        return "\n".join(lines_list[1:])  # 1: skips the first blank line caused by the algo

    def _repr_stripes(self):
        """print rows as stripes style"""
        max_margin = 0
        for name, data_type in self.schema:
            length = len(_get_header_entry(name, data_type, self.with_types)) + 1  # to account for the '='
            if length > max_margin:
                max_margin = length
        # use the widest header as the margin unless a smaller margin was requested
        if not self.margin or max_margin < self.margin:
            self.margin = max_margin
        lines_list = []
        rows = self.rows or [['' for entry in self.schema]]  # no rows: print one stripe of empty values
        for row_index in xrange(len(rows)):
            lines_list.append(self._get_stripe_header(self.offset+row_index))
            lines_list.extend([self._get_stripe_entry(i, name, data_type, value)
                               for i, ((name, data_type), value) in enumerate(zip(self.schema, rows[row_index]))])
        return "\n".join(lines_list)

    def _get_stripe_header(self, index):
        """returns the '[index]-----' line that starts a stripe, dashed out to the margin"""
        row_number = "[%s]" % index
        return row_number + "-" * (self.margin - len(row_number))

    def _get_stripe_entry(self, i, name, data_type, value):
        """returns one 'header=value' line of a stripe for column i"""
        entry = _get_header_entry(name, data_type, self.with_types)
        return "%s=%s" % (pad_right(entry, self.margin - 1), self.value_formatters[i](value))

    def _get_value_formatter(self, data_type):
        """returns the function used to turn a value of the given data_type into its printed form"""
        # compare with None rather than truthiness: round=0 is a valid setting
        if self.round is not None and is_type_float(data_type):
            return self.get_rounder(data_type)
        elif isinstance(data_type, dtypes.vector):
            return self.get_vector_formatter()
        elif data_type == dtypes.datetime:
            return self.get_datetime_formatter()
        if self.truncate and is_type_unicode(data_type):
            return self.get_truncater()
        return identity

    @staticmethod
    def _get_wrap_entry(data, size, formatter, relative_column_index, extra_tuples):
        """
        returns the padded text of one cell for 'clumps' style

        Strings, None, lists and tuples are padded on the right; every other value is
        padded on the left.  For a multi-line string only the first line is returned and
        the remaining lines are appended to extra_tuples as (relative_column_index, lines).
        """
        entry = unicode(formatter(data)).encode('utf-8')
        if isinstance(data, basestring):
            lines = entry.splitlines()
            if len(lines) > 1:
                entry = lines[0]  # take the first line now, and save the rest in an 'extra' tuple
                extra_tuples.append((relative_column_index, lines[1:]))
            return pad_right(entry, size)
        elif data is None or isinstance(data, list) or isinstance(data, tuple):
            return pad_right(entry, size)
        else:
            return pad_left(entry, size)

    def get_truncater(self):
        """returns a function which truncates a string to the current truncate setting"""
        target_len = self.truncate

        def truncate_string(s):
            return truncate(s, target_len)
        return truncate_string

    def get_rounder(self, float_type):
        """returns a function which renders a float (or a vector of floats) with the current round setting"""
        num_digits = self.round
        if isinstance(float_type, dtypes.vector):
            def vector_rounder(value):
                return round_vector(value, num_digits)
            return vector_rounder

        def rounder(value):
            return round_float(value, float_type, num_digits)
        return rounder

    def get_vector_formatter(self):
        """returns a function which renders a vector as '[a, b, ...]', printing NaN elements as None"""
        def format_vector(v):
            if v is None:
                return None
            return "[%s]" % ", ".join(["None" if np.isnan(f) else str(f) for f in v])
        return format_vector

    def get_datetime_formatter(self):
        """returns a function which renders a datetime value given as an integer, a datetime or anything else"""
        def format_datetime(d):
            from datetime import datetime
            if d is None:
                return None
            elif isinstance(d, long) or isinstance(d, int):
                return dtypes.ms_to_datetime_str(d)
            elif isinstance(d, datetime):
                return d.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
            else:
                return str(d)
        return format_datetime
def _get_header_entry(name, data_type, with_type):
    """returns the header text for a column: the name alone, or 'name:type' when with_type is set"""
    if not with_type:
        return name
    type_name = dtypes.dtypes.to_string(data_type)
    return "%s:%s" % (name, type_name)
def _get_header_entry_sizes(schema, with_types):
    """returns the length of each column's header entry, in schema order"""
    sizes = []
    for column_name, column_type in schema:
        header = _get_header_entry(column_name, column_type, with_types)
        sizes.append(len(header))
    return sizes
def is_type_float(t):
    """returns True if t resolves to float32 or float64, or if t is a vector instance"""
    resolved = dtypes.dtypes.get_from_type(t)
    if resolved is dtypes.float32 or resolved is dtypes.float64:
        return True
    return isinstance(t, dtypes.vector)
def is_type_unicode(t):
    """returns True if t resolves to the unicode type"""
    resolved = dtypes.dtypes.get_from_type(t)
    return resolved is unicode
def pad_left(s, target_len):
    """pads string s on the left such that it has at least length target_len"""
    shortfall = target_len - len(s)  # zero or negative means no padding is added
    padding = ' ' * shortfall
    return padding + s
def pad_right(s, target_len):
    """pads string s on the right such that it has at least length target_len"""
    shortfall = target_len - len(s)  # zero or negative means no padding is added
    padding = ' ' * shortfall
    return s + padding
def truncate(s, target_len):
    """truncates string to the target_len, ending a shortened string with the ellipses marker"""
    marker_len = len(ellipses)
    if target_len < marker_len:
        raise ValueError("Bad truncate length %s. "
                         "Must be set to at least %s to allow for a '%s'." % (target_len, marker_len, ellipses))
    if s is not None and len(s) > target_len:
        # keep room for the marker so the result is exactly target_len long
        kept = s[:target_len - marker_len]
        return kept + ellipses
    return s
def round_float(f, float_type, num_digits):
    """
    provides a rounded, formatted string for the given number of decimal places

    :param f: the number to format; None is passed through as None
    :param float_type: callable applied to f to obtain the value whose printed form decides
                       how many fractional digits are meaningful
    :param num_digits: number of decimal places to round to
    :return: (str) the formatted number; at least one decimal place is always shown
    """
    if f is None:
        return None
    # todo - remove this float_type force and use type passed in...
    import numpy as np

    text = str(float_type(f))
    whole, dot, fraction = text.partition('.')
    if dot and fraction.isdigit():
        # plain decimal notation: print no more digits than the value's own text has and
        # zero-pad the rest, which avoids showing float noise beyond that text
        max_len = len(fraction)
        precision = min(num_digits, max_len) or 1
        padding = '0' * (num_digits - max_len)
    else:
        # 'nan', 'inf' and exponent notation (e.g. '1e-05') have no usable fractional part
        precision = num_digits or 1
        padding = ''
    template = "%%.%df%s" % (precision, padding)
    return template % np.float64(f).round(num_digits)
def round_vector(v, num_digits):
    """provides a rounded, formatted string to represent the vector (None is passed through)"""
    if v is None:
        return None
    rounded = [round_float(element, dtypes.float64, num_digits) for element in v]
    return "[%s]" % ", ".join(rounded)
def identity(value):
    """returns value unchanged; the formatter for values that need no special formatting"""
    return value
def _get_col_sizes(rows, row_index, row_count, header_sizes, formatters):
    """
    returns the width needed by each column for the rows in [row_index, row_index + row_count)

    Starts from a copy of header_sizes and widens a column whenever the longest line of a
    formatted value in that column is longer.  Row positions past the end of rows are ignored.
    """
    sizes = list(header_sizes)
    stop = min(row_index + row_count, len(rows))
    for r in xrange(row_index, stop):
        row = rows[r]
        for c in xrange(len(sizes)):
            text = unicode(formatters[c](row[c]))
            # a multi-line value needs only the width of its longest line (0 for empty text)
            widest = max([len(line) for line in text.splitlines()] or [0])
            if widest > sizes[c]:
                sizes[c] = widest
    return sizes
def _get_num_cols(schema, width, start_col_index, col_sizes, margin):
    """goes through the col_sizes starting at the given index and finds
    how many columns can be included on a line

    A single column which is too wide on its own still counts, so the result is 0 only
    when start_col_index is already at the end of the schema (or the margin fills the width).
    """
    total_cols = len(schema)
    used = margin - spaces_between_cols  # the first column is not preceded by a spacer
    count = 0
    while used < width and start_col_index + count < total_cols:
        needed = col_sizes[start_col_index + count] + spaces_between_cols
        if used + needed > width:
            # does not fit; an over-wide first column is taken anyway
            return max(count, 1)
        count += 1
        used += needed
    return count
def _get_row_clump_count(row_count, wrap):
    """
    returns the number of clumps needed to print row_count rows, wrap rows per clump

    Zero rows still produce one clump.  The result is always an integer: divmod does not
    depend on whether '/' performs integer or true division.
    """
    if row_count == 0:
        return 1
    full_clumps, leftover = divmod(row_count, wrap)
    return full_clumps + (1 if leftover else 0)
def _get_lines_from_extra_tuples(tuples, col_sizes, margin):
    """
    (for wrap formatting) builds the continuation lines for the multi-line values of a row

    :param tuples: list of tuples of the form (relative column index, [lines]), ordered by
                   column index; it is consumed in place and is empty on return
    :param col_sizes: the col_sizes for the 'current' clump of columns; the relative column
                      indices in tuples are positions in this list
    :param margin: width of the row index column, left blank on continuation lines
    :return: list of new, full-fledged extra lines that come from the tuples
    """
    new_lines = []
    while tuples:
        tuple_index = 0
        new_line_columns = [' ' * margin]
        for size_index in xrange(len(col_sizes)):
            # every cell, blank or not, spans its column plus the spacer so that the
            # following columns line up with the header and row lines
            cell_width = col_sizes[size_index] + spaces_between_cols
            if tuple_index < len(tuples) and size_index == tuples[tuple_index][0]:
                index, lines = tuples[tuple_index]  # the 'tuple'
                entry = lines.pop(0)
                new_line_columns.append(pad_right(entry, cell_width))
                if not lines:
                    del tuples[tuple_index]  # remove empty tuple, which also naturally moves index to the next tuple
                else:
                    tuple_index += 1  # move on to the next tuple
            else:
                new_line_columns.append(' ' * cell_width)
        new_lines.append(''.join(new_line_columns).rstrip())
    return new_lines
def inspect(self,
            n=10,
            offset=0,
            columns=None,
            wrap=inspect_settings._unspecified,
            truncate=inspect_settings._unspecified,
            round=inspect_settings._unspecified,
            width=inspect_settings._unspecified,
            margin=inspect_settings._unspecified,
            with_types=inspect_settings._unspecified):
    """
    Pretty-print of the frame data

    Essentially returns a string, but technically returns a RowsInspection object which renders a string.
    The RowsInspection object naturally converts to a str when needed, like when printed or when displayed
    by python REPL (i.e. using the object's __repr__).  If running in a script and want the inspect output
    to be printed, then it must be explicitly printed, e.g. `print frame.inspect()`

    Parameters
    ----------

    :param n: (Optional[int]) The number of rows to print
    :param offset: (Optional[int]) The number of rows to skip before printing.
    :param columns: (Optional[List[str]]) Filter columns to be included.  By default, all columns are included.
    :param wrap: (Optional[int or 'stripes']) If set to 'stripes' then inspect prints rows in stripes; if set to an
                 integer N, rows will be printed in clumps of N rows, where the columns are wrapped.
    :param truncate: (Optional[int]) If set to integer N, all strings will be truncated to length N, including all
                     tagged ellipses.
    :param round: (Optional[int]) If set to integer N, all floating point numbers will be rounded and truncated to
                  N digits.
    :param width: (Optional[int]) If set to integer N, the print out will try to honor a max line width of N.
    :param margin: (Optional[int]) Applies to 'stripes' mode only.  If set to integer N, the margin for printing names
                   in a stripe will be limited to N characters.
    :param with_types: (Optional[bool]) If set to True, header will include the data_type of each column.
    :return: (RowsInspection) An object which naturally converts to a pretty-print string.

    Examples
    --------

    To look at the first 4 rows of data in a frame:

    >>> frame.inspect(4)
    [#] animal name age weight
    ==================================
    [0] human George 8 542.5
    [1] human Ursula 6 495.0
    [2] ape Ape 41 400.0
    [3] elephant Shep 5 8630.0

    # For other examples, see :ref:`example_frame.inspect`.

    Note: if the frame data contains unicode characters, this method may raise a Unicode exception when
    running in an interactive REPL or otherwise which triggers the standard python repr().  To get around
    this problem, explicitly print the unicode of the returned object:

    >>> print unicode(frame.inspect())

    **Global Settings**

    If not specified, the arguments that control formatting receive default values from
    'sparktk.inspect_settings'.  Make changes there to affect all calls to inspect.

    >>> import sparktk
    >>> sparktk.inspect_settings
    wrap 20
    truncate None
    round None
    width 80
    margin None
    with_types False
    >>> sparktk.inspect_settings.width = 120 # changes inspect to use 120 width globally
    >>> sparktk.inspect_settings.truncate = 16 # changes inspect to always truncate strings to 16 chars
    >>> sparktk.inspect_settings
    wrap 20
    truncate 16
    round None
    width 120
    margin None
    with_types False
    >>> sparktk.inspect_settings.width = None # return value back to default
    >>> sparktk.inspect_settings
    wrap 20
    truncate 16
    round None
    width 80
    margin None
    with_types False
    >>> sparktk.inspect_settings.reset() # set everything back to default
    >>> sparktk.inspect_settings
    wrap 20
    truncate None
    round None
    width 80
    margin None
    with_types False
    """
    from sparktk.frame.ops.take import take_rich

    # the settings are resolved before take_rich is called, so an invalid
    # formatting argument raises first
    settings = inspect_settings.copy(wrap=wrap,
                                     truncate=truncate,
                                     round=round,
                                     width=width,
                                     margin=margin,
                                     with_types=with_types)
    taken = take_rich(self, n, offset, columns)
    return RowsInspection(taken.data, taken.schema, offset=offset, format_settings=settings)
Module variables
var ellipses
var inspect_settings
var spaces_between_cols
Functions
def identity(
value)
def identity(value):
    """returns value unchanged; the formatter for values that need no special formatting"""
    return value
def inspect(
self, n=10, offset=0, columns=None, wrap='inspect_settings', truncate='inspect_settings', round='inspect_settings', width='inspect_settings', margin='inspect_settings', with_types='inspect_settings')
Pretty-print of the frame data
Essentially returns a string, but technically returns a RowsInspection object which renders a string.
The RowsInspection object naturally converts to a str when needed, like when printed or when displayed
by python REPL (i.e. using the object's __repr__). If running in a script and want the inspect output
to be printed, then it must be explicitly printed, e.g. `print frame.inspect()`
n | (Optional[int]): | The number of rows to print |
offset | (Optional[int]): | The number of rows to skip before printing. |
columns | (Optional[List[str]]): | Filter columns to be included. By default, all columns are included. |
wrap | (Optional[int or 'stripes']): | If set to 'stripes' then inspect prints rows in stripes; if set to an integer N, rows will be printed in clumps of N rows, where the columns are wrapped. |
truncate | (Optional[int]): | If set to integer N, all strings will be truncated to length N, including all tagged ellipses. |
round | (Optional[int]): | If set to integer N, all floating point numbers will be rounded and truncated to N digits. |
width | (Optional[int]): | If set to integer N, the print out will try to honor a max line width of N. |
margin | (Optional[int]): | Applies to 'stripes' mode only. If set to integer N, the margin for printing names in a stripe will be limited to N characters. |
with_types | (Optional[bool]): | If set to True, header will include the data_type of each column. |
Returns | (RowsInspection): | An object which naturally converts to a pretty-print string. |
To look at the first 4 rows of data in a frame:
>>> frame.inspect(4)
[#] animal name age weight
==================================
[0] human George 8 542.5
[1] human Ursula 6 495.0
[2] ape Ape 41 400.0
[3] elephant Shep 5 8630.0
For other examples, see :ref:`example_frame.inspect`.
Note: if the frame data contains unicode characters, this method may raise a Unicode exception when running in an interactive REPL or otherwise which triggers the standard python repr(). To get around this problem, explicitly print the unicode of the returned object:
>>> print unicode(frame.inspect())
Global Settings
If not specified, the arguments that control formatting receive default values from 'sparktk.inspect_settings'. Make changes there to affect all calls to inspect.
>>> import sparktk
>>> sparktk.inspect_settings
wrap 20
truncate None
round None
width 80
margin None
with_types False
>>> sparktk.inspect_settings.width = 120 # changes inspect to use 120 width globally
>>> sparktk.inspect_settings.truncate = 16 # changes inspect to always truncate strings to 16 chars
>>> sparktk.inspect_settings
wrap 20
truncate 16
round None
width 120
margin None
with_types False
>>> sparktk.inspect_settings.width = None # return value back to default
>>> sparktk.inspect_settings
wrap 20
truncate 16
round None
width 80
margin None
with_types False
>>> sparktk.inspect_settings.reset() # set everything back to default
>>> sparktk.inspect_settings
wrap 20
truncate None
round None
width 80
margin None
with_types False
def inspect(self,
            n=10,
            offset=0,
            columns=None,
            wrap=inspect_settings._unspecified,
            truncate=inspect_settings._unspecified,
            round=inspect_settings._unspecified,
            width=inspect_settings._unspecified,
            margin=inspect_settings._unspecified,
            with_types=inspect_settings._unspecified):
    """
    Pretty-print of the frame data

    Essentially returns a string, but technically returns a RowsInspection object which renders a string.
    The RowsInspection object naturally converts to a str when needed, like when printed or when displayed
    by python REPL (i.e. using the object's __repr__).  If running in a script and want the inspect output
    to be printed, then it must be explicitly printed, e.g. `print frame.inspect()`

    Parameters
    ----------

    :param n: (Optional[int]) The number of rows to print
    :param offset: (Optional[int]) The number of rows to skip before printing.
    :param columns: (Optional[List[str]]) Filter columns to be included.  By default, all columns are included.
    :param wrap: (Optional[int or 'stripes']) If set to 'stripes' then inspect prints rows in stripes; if set to an
                 integer N, rows will be printed in clumps of N rows, where the columns are wrapped.
    :param truncate: (Optional[int]) If set to integer N, all strings will be truncated to length N, including all
                     tagged ellipses.
    :param round: (Optional[int]) If set to integer N, all floating point numbers will be rounded and truncated to
                  N digits.
    :param width: (Optional[int]) If set to integer N, the print out will try to honor a max line width of N.
    :param margin: (Optional[int]) Applies to 'stripes' mode only.  If set to integer N, the margin for printing names
                   in a stripe will be limited to N characters.
    :param with_types: (Optional[bool]) If set to True, header will include the data_type of each column.
    :return: (RowsInspection) An object which naturally converts to a pretty-print string.

    Examples
    --------

    To look at the first 4 rows of data in a frame:

    >>> frame.inspect(4)
    [#] animal name age weight
    ==================================
    [0] human George 8 542.5
    [1] human Ursula 6 495.0
    [2] ape Ape 41 400.0
    [3] elephant Shep 5 8630.0

    # For other examples, see :ref:`example_frame.inspect`.

    Note: if the frame data contains unicode characters, this method may raise a Unicode exception when
    running in an interactive REPL or otherwise which triggers the standard python repr().  To get around
    this problem, explicitly print the unicode of the returned object:

    >>> print unicode(frame.inspect())

    **Global Settings**

    If not specified, the arguments that control formatting receive default values from
    'sparktk.inspect_settings'.  Make changes there to affect all calls to inspect.

    >>> import sparktk
    >>> sparktk.inspect_settings
    wrap 20
    truncate None
    round None
    width 80
    margin None
    with_types False
    >>> sparktk.inspect_settings.width = 120 # changes inspect to use 120 width globally
    >>> sparktk.inspect_settings.truncate = 16 # changes inspect to always truncate strings to 16 chars
    >>> sparktk.inspect_settings
    wrap 20
    truncate 16
    round None
    width 120
    margin None
    with_types False
    >>> sparktk.inspect_settings.width = None # return value back to default
    >>> sparktk.inspect_settings
    wrap 20
    truncate 16
    round None
    width 80
    margin None
    with_types False
    >>> sparktk.inspect_settings.reset() # set everything back to default
    >>> sparktk.inspect_settings
    wrap 20
    truncate None
    round None
    width 80
    margin None
    with_types False
    """
    from sparktk.frame.ops.take import take_rich

    # the settings are resolved before take_rich is called, so an invalid
    # formatting argument raises first
    settings = inspect_settings.copy(wrap=wrap,
                                     truncate=truncate,
                                     round=round,
                                     width=width,
                                     margin=margin,
                                     with_types=with_types)
    taken = take_rich(self, n, offset, columns)
    return RowsInspection(taken.data, taken.schema, offset=offset, format_settings=settings)
def is_type_float(
t)
def is_type_float(t):
    """returns True if t resolves to float32 or float64, or if t is a vector instance"""
    resolved = dtypes.dtypes.get_from_type(t)
    if resolved is dtypes.float32 or resolved is dtypes.float64:
        return True
    return isinstance(t, dtypes.vector)
def is_type_unicode(
t)
def is_type_unicode(t):
    """returns True if t resolves to the unicode type"""
    resolved = dtypes.dtypes.get_from_type(t)
    return resolved is unicode
def pad_left(
s, target_len)
pads string s on the left such that it has at least length target_len
def pad_left(s, target_len):
    """pads string s on the left such that it has at least length target_len"""
    shortfall = target_len - len(s)  # zero or negative means no padding is added
    padding = ' ' * shortfall
    return padding + s
def pad_right(
s, target_len)
pads string s on the right such that it has at least length target_len
def pad_right(s, target_len):
    """pads string s on the right such that it has at least length target_len"""
    shortfall = target_len - len(s)  # zero or negative means no padding is added
    padding = ' ' * shortfall
    return s + padding
def round_float(
f, float_type, num_digits)
provides a rounded, formatted string for the given number of decimal places
def round_float(f, float_type, num_digits):
    """
    provides a rounded, formatted string for the given number of decimal places

    :param f: the number to format; None is passed through as None
    :param float_type: callable applied to f to obtain the value whose printed form decides
                       how many fractional digits are meaningful
    :param num_digits: number of decimal places to round to
    :return: (str) the formatted number; at least one decimal place is always shown
    """
    if f is None:
        return None
    # todo - remove this float_type force and use type passed in...
    import numpy as np

    text = str(float_type(f))
    whole, dot, fraction = text.partition('.')
    if dot and fraction.isdigit():
        # plain decimal notation: print no more digits than the value's own text has and
        # zero-pad the rest, which avoids showing float noise beyond that text
        max_len = len(fraction)
        precision = min(num_digits, max_len) or 1
        padding = '0' * (num_digits - max_len)
    else:
        # 'nan', 'inf' and exponent notation (e.g. '1e-05') have no usable fractional part
        precision = num_digits or 1
        padding = ''
    template = "%%.%df%s" % (precision, padding)
    return template % np.float64(f).round(num_digits)
def round_vector(
v, num_digits)
provides a rounded, formatted string to represent the vector
def round_vector(v, num_digits):
    """provides a rounded, formatted string to represent the vector (None is passed through)"""
    if v is None:
        return None
    rounded = [round_float(element, dtypes.float64, num_digits) for element in v]
    return "[%s]" % ", ".join(rounded)
def truncate(
s, target_len)
truncates string to the target_len
def truncate(s, target_len):
    """truncates string to the target_len, ending a shortened string with the ellipses marker"""
    marker_len = len(ellipses)
    if target_len < marker_len:
        raise ValueError("Bad truncate length %s. "
                         "Must be set to at least %s to allow for a '%s'." % (target_len, marker_len, ellipses))
    if s is not None and len(s) > target_len:
        # keep room for the marker so the result is exactly target_len long
        kept = s[:target_len - marker_len]
        return kept + ellipses
    return s
Classes
class InspectSettings
Global settings for the 'inspect' method
wrap: int or 'stripes' If set to 'stripes' then inspect prints rows in stripes; if set to an integer N, rows will be printed in clumps of N rows, where the columns are wrapped
truncate: int If set to integer N, all strings will be truncated to length N, including a tagged ellipses
round: int If set to integer N, all floating point numbers will be rounded and truncated to N digits
width: int If set to integer N, the print out will try to honor a max line width of N
margin: int (only has meaning in 'stripes' mode) If set to integer N, the margin for printing names in a stripe will be limited to N characters
with_types: bool If set to True, header will include the data_type of each column
class InspectSettings(object):
"""Global settings for the 'inspect' method
wrap: int or 'stripes'
If set to 'stripes' then inspect prints rows in stripes; if set to an integer N,
rows will be printed in clumps of N columns, where the columns are wrapped")
truncate: int
If set to integer N, all strings will be truncated to length N, including a tagged ellipses
round: int
If set to integer N, all floating point numbers will be rounded and truncated to N digits
width: int
If set to integer N, the print out will try to honor a max line width of N
margin: int (only has meaning in 'stripes' mode)
If set to integer N, the margin for printing names in a stripe will be limited to N characters
with_types: bool
If set to True, header will include the data_type of each column
"""
_unspecified = 'inspect_settings' # sentinel
_default_wrap = 20
_default_truncate = None
_default_round = None
_default_width = 80
_default_margin = None
_default_with_types = False
def __init__(self, wrap=None, truncate=None, round=None, width=None, margin=None, with_types=None):
self._wrap = None
self.wrap = wrap
self._truncate = None
self.truncate = truncate
self._round = None
self.round = round
self._width = None
self.width = width
self._margin = None
self.margin = margin
self._with_types = None
self.with_types = with_types
def reset(self):
"""returns all the settings to their default values"""
# the setters use None to restore to default
self.wrap = None
self.truncate = None
self.round = None
self.width = None
self.margin = None
self.with_types = None
def copy(self,
wrap=_unspecified,
truncate=_unspecified,
round=_unspecified,
width=_unspecified,
margin=_unspecified,
with_types=_unspecified):
"""create a copy of this settings object and override any values specified"""
c = InspectSettings(self.wrap, self.truncate, self.round, self.width, self.margin, self.with_types)
if wrap is not self._unspecified:
c.wrap = wrap
if truncate is not self._unspecified:
c.truncate = truncate
if round is not self._unspecified:
c.round = round
if width is not self._unspecified:
c.width = width
if margin is not self._unspecified:
c.margin = margin
if with_types is not self._unspecified:
c.with_types = with_types
return c
def __repr__(self):
"""displays current settings"""
return """wrap %8s
truncate %8s
round %8s
width %8s
margin %8s
with_types %8s""" % (self.wrap, self.truncate, self.round, self.width, self.margin, self.with_types)
@property
def wrap(self):
"""
If set to 'stripes' then inspect prints rows in stripes; if set to an integer N,
rows will be printed in clumps of N columns, where the columns are wrapped")
"""
return self._wrap
@wrap.setter
def wrap(self, value):
supported_strings = ['stripes']
if value is None:
value = self._default_wrap
if not isinstance(value, (basestring, int, long)) or \
(isinstance(value, basestring) and value not in supported_strings) or \
(isinstance(value, (int, long)) and value <= 0):
raise ValueError("Bad value %s. wrap must be a integer > 0 or one of the following strings: %s" %
(value, ", ".join(supported_strings)))
self._wrap = value
@property
def truncate(self):
    """If set to integer N, all strings will be truncated to length N, including a tagged ellipses"""
    return self._truncate

@truncate.setter
def truncate(self, value):
    # None means "restore the class default"; a None default is stored as-is
    limit = self._default_truncate if value is None else value
    if limit is not None:
        if not isinstance(limit, (int, long)) or limit <= 0:
            raise ValueError("Bad value %s. truncate must be a integer > 0" % limit)
    self._truncate = limit
@property
def round(self):
    """If set to integer N, all floating point numbers will be rounded and truncated to N digits"""
    return self._round

@round.setter
def round(self, value):
    # None means "restore the class default"; a None default is stored as-is
    digits = self._default_round if value is None else value
    if digits is not None:
        if not isinstance(digits, (int, long)) or digits < 0:
            raise ValueError("Bad value %s. round must be an integer >= 0" % digits)
    self._round = digits
@property
def width(self):
    """If set to integer N, the print out will try to honor a max line width of N"""
    return self._width

@width.setter
def width(self, value):
    """
    Set the max line width.

    None restores the class default.  Any other value must be an integer
    strictly greater than zero, otherwise ValueError is raised.
    """
    if value is None:
        value = self._default_width
    if not isinstance(value, (int, long)) or value <= 0:
        # the check rejects 0, so the message says "> 0" rather than ">= 0"
        raise ValueError("Bad value %s. width must be an integer > 0" % value)
    self._width = value
@property
def margin(self):
    """
    (only has meaning in wrap='stripes' mode)
    If set to integer N, the margin for printing names in a stripe will be limited to N characters
    """
    return self._margin

@margin.setter
def margin(self, value):
    """
    Set the stripe margin.

    None restores the class default.  A non-None value must be an integer
    strictly greater than zero, otherwise ValueError is raised.
    """
    if value is None:
        value = self._default_margin
    if value is not None and (not isinstance(value, (int, long)) or value <= 0):
        # the check rejects 0, so the message says "> 0" rather than ">= 0"
        raise ValueError("Bad value %s. margin must be an integer > 0" % value)
    self._margin = value
@property
def with_types(self):
    """If set to True, header will include the data_type of each column"""
    return self._with_types

@with_types.setter
def with_types(self, value):
    """
    Set whether headers include column data types.

    None restores the class default.  Any other value must be a bool,
    otherwise ValueError is raised.
    """
    if value is None:
        value = self._default_with_types
    if not isinstance(value, bool):
        # this setting is a flag, so the message names bool rather than an integer range
        raise ValueError("Bad value %s. with_types must be a bool" % value)
    self._with_types = value
Ancestors (in MRO)
- InspectSettings
- __builtin__.object
Instance variables
var margin
var round
var truncate
var width
var with_types
var wrap
Methods
def __init__(
self, wrap=None, truncate=None, round=None, width=None, margin=None, with_types=None)
def __init__(self, wrap=None, truncate=None, round=None, width=None, margin=None, with_types=None):
    """
    Create a settings object; any argument left as None takes the class default.

    Backing fields are created first, then each value is assigned through
    its public attribute.
    """
    self._wrap = self._truncate = self._round = None
    self._width = self._margin = self._with_types = None

    self.wrap = wrap
    self.truncate = truncate
    self.round = round
    self.width = width
    self.margin = margin
    self.with_types = with_types
def copy(
self, wrap='inspect_settings', truncate='inspect_settings', round='inspect_settings', width='inspect_settings', margin='inspect_settings', with_types='inspect_settings')
create a copy of this settings object and override any values specified
def copy(self,
         wrap=_unspecified,
         truncate=_unspecified,
         round=_unspecified,
         width=_unspecified,
         margin=_unspecified,
         with_types=_unspecified):
    """
    create a copy of this settings object and override any values specified

    An argument left at the `_unspecified` sentinel keeps this object's
    current value; any other argument (including None, which the setters
    treat as "use the default") is assigned to the copy.
    """
    duplicate = InspectSettings(self.wrap, self.truncate, self.round, self.width, self.margin, self.with_types)
    requested = (('wrap', wrap),
                 ('truncate', truncate),
                 ('round', round),
                 ('width', width),
                 ('margin', margin),
                 ('with_types', with_types))
    for setting_name, override in requested:
        if override is not self._unspecified:
            setattr(duplicate, setting_name, override)
    return duplicate
def reset(
self)
returns all the settings to their default values
def reset(self):
    """returns all the settings to their default values"""
    # every property setter interprets None as "restore the default",
    # so assigning None to each setting in turn performs the reset
    for setting_name in ('wrap', 'truncate', 'round', 'width', 'margin', 'with_types'):
        setattr(self, setting_name, None)
class RowsInspection
class used specifically for inspect, where the repr is the main use case
class RowsInspection(object):
    """
    class used specifically for inspect, where the __repr__ is the main use case

    Renders a batch of rows as text according to an InspectSettings object:
    in 'stripes' mode each row is printed as one "name=value" line per
    column, otherwise rows are printed in clumps beneath column headers.
    """

    def __init__(self, rows, schema, offset, format_settings=inspect_settings):
        """
        :param rows: sequence of rows; each row is a sequence of values ordered like schema
        :param schema: sequence of (name, data_type) pairs, one per column
        :param offset: number added to a row's position to produce its displayed index
        :param format_settings: InspectSettings object controlling the layout
        :raises TypeError: if format_settings is not an InspectSettings
        """
        if not isinstance(format_settings, InspectSettings):
            raise TypeError("argument format_settings must be type %s" % InspectSettings)
        if format_settings.wrap == 'stripes':
            self._repr = self._repr_stripes
        else:
            # never use a clump size larger than the number of rows available
            self.wrap = min(format_settings.wrap, len(rows)) or len(rows)
            self._repr = self._repr_wrap
        self.rows = rows
        self.schema = schema
        self.offset = offset
        self.truncate = format_settings.truncate
        self.round = format_settings.round
        self.width = format_settings.width
        self.margin = format_settings.margin
        self.with_types = format_settings.with_types
        # built last: formatter selection reads self.round and self.truncate
        self.value_formatters = [self._get_value_formatter(data_type) for name, data_type in schema]

    def __repr__(self):
        """Return the text rendering chosen in __init__ (stripes or wrapped clumps)."""
        return self._repr()

    def _repr_wrap(self):
        """print rows in a 'clumps' style"""
        row_index_str_format = '[%s]' + ' ' * spaces_between_cols

        def _get_row_index_str(index):
            return row_index_str_format % index

        row_count = len(self.rows)
        row_clump_count = _get_row_clump_count(row_count, self.wrap)
        header_sizes = _get_header_entry_sizes(self.schema, self.with_types)
        column_spacer = ' ' * spaces_between_cols
        lines_list = []
        extra_tuples = []  # filled by _get_wrap_entry with the overflow lines of multi-line strings

        for row_clump_index in xrange(row_clump_count):
            if row_clump_index > 0:
                lines_list.append('')  # extra line for new clump
            start_row_index = row_clump_index * self.wrap
            stop_row_index = start_row_index + self.wrap
            if stop_row_index > row_count:
                stop_row_index = row_count
            # the index header is as wide as the largest row index shown in this clump
            row_index_header = _get_row_index_str('#' * len(str(self.offset + stop_row_index - 1)))
            margin = len(row_index_header)
            col_sizes = _get_col_sizes(self.rows, start_row_index, self.wrap, header_sizes, self.value_formatters)

            col_index = 0
            while col_index < len(self.schema):
                num_cols = _get_num_cols(self.schema, self.width, col_index, col_sizes, margin)
                if num_cols == 0:
                    raise RuntimeError("Internal error, num_cols == 0")  # sanity check on algo
                header_entries = [
                    pad_right(_get_header_entry(name, data_type, self.with_types),
                              min(self.width - margin, col_sizes[col_index + i]))
                    for i, (name, data_type) in enumerate(self.schema[col_index:col_index + num_cols])]
                header_line = row_index_header + column_spacer.join(header_entries)
                thick_line = "=" * len(header_line)
                lines_list.extend(["", header_line, thick_line])
                if row_count:
                    for row_index in xrange(start_row_index, stop_row_index):
                        row_entries = [
                            self._get_wrap_entry(data,
                                                 col_sizes[col_index + i],
                                                 self.value_formatters[col_index + i],
                                                 i,
                                                 extra_tuples)
                            for i, data in enumerate(self.rows[row_index][col_index:col_index + num_cols])]
                        new_line = pad_right(_get_row_index_str(self.offset + row_index), margin) + \
                            column_spacer.join(row_entries)
                        lines_list.append(new_line.rstrip())
                        if extra_tuples:
                            lines_list.extend(_get_lines_from_extra_tuples(extra_tuples,
                                                                           col_sizes[col_index:col_index + num_cols],
                                                                           margin))
                col_index += num_cols

        return "\n".join(lines_list[1:])  # 1: skips the first blank line caused by the algo

    def _repr_stripes(self):
        """print rows as stripes style"""
        max_margin = 0
        for name, data_type in self.schema:
            length = len(_get_header_entry(name, data_type, self.with_types)) + 1  # to account for the '='
            if length > max_margin:
                max_margin = length
        # use the widest header unless a smaller margin was explicitly configured
        if not self.margin or max_margin < self.margin:
            self.margin = max_margin

        lines_list = []
        # with no rows, render one stripe of empty values so the column names still show
        rows = self.rows or [['' for entry in self.schema]]
        for row_index in xrange(len(rows)):
            lines_list.append(self._get_stripe_header(self.offset + row_index))
            lines_list.extend([self._get_stripe_entry(i, name, data_type, value)
                               for i, ((name, data_type), value) in enumerate(zip(self.schema, rows[row_index]))])
        return "\n".join(lines_list)

    def _get_stripe_header(self, index):
        """Return the "[index]" label followed by dashes filling out self.margin characters."""
        row_number = "[%s]" % index
        return row_number + "-" * (self.margin - len(row_number))

    def _get_stripe_entry(self, i, name, data_type, value):
        """Return one "name=value" stripe line for column i, using that column's formatter."""
        entry = _get_header_entry(name, data_type, self.with_types)
        return "%s=%s" % (pad_right(entry, self.margin - 1), self.value_formatters[i](value))

    def _get_value_formatter(self, data_type):
        """
        Choose the function used to format values of the given column type.

        Rounding takes precedence for float types, then the vector and
        datetime formatters, then string truncation; otherwise `identity`.
        """
        # compare with None rather than truthiness so that round=0 still rounds
        if self.round is not None and is_type_float(data_type):
            return self.get_rounder(data_type)
        elif isinstance(data_type, dtypes.vector):
            return self.get_vector_formatter()
        elif data_type == dtypes.datetime:
            return self.get_datetime_formatter()
        if self.truncate and is_type_unicode(data_type):
            return self.get_truncater()
        return identity

    @staticmethod
    def _get_wrap_entry(data, size, formatter, relative_column_index, extra_tuples):
        """
        Format one cell for the wrapped layout and pad it to `size`.

        Strings, None, lists and tuples go through pad_right; every other
        value goes through pad_left.  For a multi-line string only the first
        line is returned; the remaining lines are appended to `extra_tuples`
        as (relative_column_index, lines).
        """
        entry = unicode(formatter(data)).encode('utf-8')
        if isinstance(data, basestring):
            lines = entry.splitlines()
            if len(lines) > 1:
                entry = lines[0]  # take the first line now, and save the rest in an 'extra' tuple
                extra_tuples.append((relative_column_index, lines[1:]))
            return pad_right(entry, size)
        elif data is None or isinstance(data, list) or isinstance(data, tuple):
            return pad_right(entry, size)
        else:
            return pad_left(entry, size)

    def get_truncater(self):
        """Return a function that applies `truncate` to a string using the current self.truncate."""
        target_len = self.truncate

        def truncate_string(s):
            return truncate(s, target_len)
        return truncate_string

    def get_rounder(self, float_type):
        """
        Return a function that rounds values of `float_type` to self.round digits.

        Vector types get a rounder built on round_vector; all other types
        get one built on round_float.
        """
        num_digits = self.round
        if isinstance(float_type, dtypes.vector):
            def vector_rounder(value):
                return round_vector(value, num_digits)
            return vector_rounder

        def rounder(value):
            return round_float(value, float_type, num_digits)
        return rounder

    def get_vector_formatter(self):
        """Return a function rendering a vector as "[a, b, ...]", with NaN elements shown as None."""
        def format_vector(v):
            if v is None:
                return None
            return "[%s]" % ", ".join(["None" if np.isnan(f) else str(f) for f in v])
        return format_vector

    def get_datetime_formatter(self):
        """
        Return a function rendering datetime column values as strings.

        None stays None, integers go through dtypes.ms_to_datetime_str,
        datetime objects are formatted with strftime, and anything else is
        passed to str().
        """
        def format_datetime(d):
            from datetime import datetime
            if d is None:
                return None
            elif isinstance(d, long) or isinstance(d, int):
                return dtypes.ms_to_datetime_str(d)
            elif isinstance(d, datetime):
                return d.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
            else:
                return str(d)
        return format_datetime
Ancestors (in MRO)
- RowsInspection
- __builtin__.object
Instance variables
var margin
var offset
var round
var rows
var schema
var truncate
var value_formatters
var width
var with_types
Methods
def __init__(
self, rows, schema, offset, format_settings=inspect_settings)
def __init__(self, rows, schema, offset, format_settings=inspect_settings):
    """
    Capture the rows, schema and layout settings and pick the renderer.

    Raises TypeError if format_settings is not an InspectSettings.
    """
    if not isinstance(format_settings, InspectSettings):
        raise TypeError("argument format_settings must be type %s" % InspectSettings)

    stripes_mode = format_settings.wrap == 'stripes'
    if not stripes_mode:
        # never use a clump size larger than the number of rows available
        self.wrap = min(format_settings.wrap, len(rows)) or len(rows)
    self._repr = self._repr_stripes if stripes_mode else self._repr_wrap

    self.rows, self.schema, self.offset = rows, schema, offset
    self.truncate = format_settings.truncate
    self.round = format_settings.round
    self.width = format_settings.width
    self.margin = format_settings.margin
    self.with_types = format_settings.with_types

    # built last: formatter selection reads the settings assigned above
    formatters = []
    for _, column_type in schema:
        formatters.append(self._get_value_formatter(column_type))
    self.value_formatters = formatters
def get_datetime_formatter(
self)
def get_datetime_formatter(self):
    """
    Return a function rendering datetime column values as strings.

    None stays None, integers go through dtypes.ms_to_datetime_str,
    datetime objects are formatted with strftime, and anything else is
    passed to str().
    """
    def format_datetime(d):
        from datetime import datetime
        if d is None:
            return None
        if isinstance(d, (long, int)):
            return dtypes.ms_to_datetime_str(d)
        if isinstance(d, datetime):
            return d.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
        return str(d)
    return format_datetime
def get_rounder(
self, float_type)
def get_rounder(self, float_type):
    """
    Return a function that rounds values of `float_type` to self.round digits.

    Vector types get a rounder built on round_vector; all other types get
    one built on round_float.  The digit count is captured when this method
    is called.
    """
    digits = self.round

    if not isinstance(float_type, dtypes.vector):
        def rounder(value):
            return round_float(value, float_type, digits)
        return rounder

    def vector_rounder(value):
        return round_vector(value, digits)
    return vector_rounder
def get_truncater(
self)
def get_truncater(self):
    """
    Return a function that applies `truncate` to a string.

    The length limit is read from self.truncate once, when this method is
    called, and captured by the returned function.
    """
    max_length = self.truncate

    def truncate_string(s):
        shortened = truncate(s, max_length)
        return shortened

    return truncate_string
def get_vector_formatter(
self)
def get_vector_formatter(self):
    """Return a function rendering a vector as "[a, b, ...]", with NaN elements shown as None."""
    def format_vector(v):
        if v is None:
            return None
        rendered = []
        for f in v:
            if np.isnan(f):
                rendered.append("None")
            else:
                rendered.append(str(f))
        return "[%s]" % ", ".join(rendered)
    return format_vector