sparktk.frame.ops.bin_column module
# vim: set encoding=utf-8
# Copyright (c) 2016 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
def bin_column(self, column_name, bins=None, include_lowest=True, strict_binning=False, bin_column_name=None):
    r"""
    Summarize rows of data based on the value in a single column by sorting them
    into bins, or groups, based on a list of bin cutoff points or a specified number of
    equal-width bins.

    Parameters
    ----------

    :param column_name: (str) Name of the column to bin
    :param bins: (Optional[List[float]]) Either a single value representing the number of equal-width bins to create, or an array of values
                 containing bin cutoff points. Array can be list or tuple. If an array is provided, values must be progressively
                 increasing. All bin boundaries must be included, so, with N bins, you need N+1 values.
                 Default (None or Empty List) is equal-width bins where the maximum number of bins is the Square-root choice
                 :math:`\lfloor \sqrt{m} \rfloor`, where :math:`m` is the number of rows.
    :param include_lowest: (bool) Specify how the boundary conditions are handled. ``True`` indicates that the lower bound
                 of the bin is inclusive. ``False`` indicates that the upper bound is inclusive. Default is ``True``.
    :param strict_binning: (bool) Specify how values outside of the cutoffs array should be binned. If set to ``True``, each
                 value less than cutoffs[0] or greater than cutoffs[-1] will be assigned a bin value of -1. If set to ``False``,
                 values less than cutoffs[0] will be included in the first bin while values greater than cutoffs[-1] will be
                 included in the final bin.
    :param bin_column_name: (str) The name for the new binned column. Default is ``<column_name>_binned``
    :return: (List[float]) a list containing the edges of each bin

    Notes
    -----

    1.  Bin IDs are 0-indexed, in other words, the lowest bin number is 0.
    2.  The first and last cutoffs are always included in the bins.
        When *include_lowest* is ``True``, the last bin includes both cutoffs.
        When *include_lowest* is ``False``, the first bin (bin 0) includes both
        cutoffs.

    Examples
    --------

    For these examples, we will use a frame with column *a* accessed by a Frame
    object *frame*:

        >>> frame.inspect(n=11)
        [##] a
        ========
        [0] 1
        [1] 1
        [2] 2
        [3] 3
        [4] 5
        [5] 8
        [6] 13
        [7] 21
        [8] 34
        [9] 55
        [10] 89

    Modify the frame with a column showing what bin the data is in, by
    specifying cutoffs for the bin edges.
    The data values should use strict_binning:

        >>> frame.bin_column('a', [5, 12, 25, 60], include_lowest=True,
        ... strict_binning=True, bin_column_name='binned_using_cutoffs')
        [===Job Progress===]
        >>> frame.inspect(n=11)
        [##] a binned_using_cutoffs
        ==============================
        [0] 1 -1
        [1] 1 -1
        [2] 2 -1
        [3] 3 -1
        [4] 5 0
        [5] 8 0
        [6] 13 1
        [7] 21 1
        [8] 34 2
        [9] 55 2
        [10] 89 -1

    Modify the frame with a column showing what bin the data is in.
    The data value should not use strict_binning:

        >>> frame.bin_column('a', [5, 12, 25, 60], include_lowest=True,
        ... strict_binning=False, bin_column_name='binned_using_cutoffs')
        [===Job Progress===]
        >>> frame.inspect(n=11)
        [##] a binned_using_cutoffs
        ==============================
        [0] 1 0
        [1] 1 0
        [2] 2 0
        [3] 3 0
        [4] 5 0
        [5] 8 0
        [6] 13 1
        [7] 21 1
        [8] 34 2
        [9] 55 2
        [10] 89 2

    Modify the frame with a column showing what bin the data is in.
    The bins should be lower inclusive:

        >>> frame.bin_column('a', [1,5,34,55,89], include_lowest=True,
        ... strict_binning=False, bin_column_name='binned_using_cutoffs')
        [===Job Progress===]
        >>> frame.inspect( n=11 )
        [##] a binned_using_cutoffs
        ==============================
        [0] 1 0
        [1] 1 0
        [2] 2 0
        [3] 3 0
        [4] 5 1
        [5] 8 1
        [6] 13 1
        [7] 21 1
        [8] 34 2
        [9] 55 3
        [10] 89 3

    Modify the frame with a column showing what bin the data is in.
    The bins should be upper inclusive:

        >>> frame.bin_column('a', [1,5,34,55,89], include_lowest=False,
        ... strict_binning=True, bin_column_name='binned_using_cutoffs')
        [===Job Progress===]
        >>> frame.inspect( n=11 )
        [##] a binned_using_cutoffs
        ==============================
        [0] 1 0
        [1] 1 0
        [2] 2 0
        [3] 3 0
        [4] 5 0
        [5] 8 1
        [6] 13 1
        [7] 21 1
        [8] 34 1
        [9] 55 2
        [10] 89 3

    Modify the frame with a column of 3 equal-width bins. This also
    returns the cutoffs that were used for creating the bins.

        >>> cutoffs = frame.bin_column('a', 3, bin_column_name='equal_width_bins')
        >>> print cutoffs
        [1.0, 30.333333333333332, 59.666666666666664, 89.0]
        >>> frame.inspect(n=frame.count())
        [##] a equal_width_bins
        ==========================
        [0] 1 0
        [1] 1 0
        [2] 2 0
        [3] 3 0
        [4] 5 0
        [5] 8 0
        [6] 13 0
        [7] 21 0
        [8] 34 1
        [9] 55 1
        [10] 89 2

    """
    # Normalize `bins` to a list: tuples are copied into a list and a lone
    # value (the requested bin count) is wrapped in a single-element list.
    # None is deliberately left untouched so the documented default is not
    # turned into [None].
    # NOTE(review): assumes to_scala_option_list_double maps None to an empty
    # Scala Option (inferred from the helper's name) -- confirm in jutils.convert.
    if isinstance(bins, tuple):
        bins = list(bins)
    elif bins is not None and not isinstance(bins, list):
        bins = [bins]

    convert = self._tc.jutils.convert
    scala_edges = self._scala.binColumn(column_name,
                                        convert.to_scala_option_list_double(bins),
                                        include_lowest,
                                        strict_binning,
                                        convert.to_scala_option(bin_column_name))
    # The Scala call returns the bin edges as a Scala sequence; hand back a Python value.
    return convert.from_scala_seq(scala_edges)
Functions
def bin_column(
self, column_name, bins=None, include_lowest=True, strict_binning=False, bin_column_name=None)
Summarize rows of data based on the value in a single column by sorting them into bins, or groups, based on a list of bin cutoff points or a specified number of equal-width bins.
column_name | (str): | Name of the column to bin |
bins | (Optional[List[float]]): | Either a single value representing the number of equal-width bins to create, or an array of values containing bin cutoff points. Array can be list or tuple. If an array is provided, values must be progressively increasing. All bin boundaries must be included, so, with N bins, you need N+1 values. Default (None or Empty List) is equal-width bins where the maximum number of bins is the Square-root choice :math:`\lfloor \sqrt{m} \rfloor`, where :math:`m` is the number of rows. |
include_lowest | (bool): | Specify how the boundary conditions are handled. ``True`` indicates that the lower bound of the bin is inclusive. ``False`` indicates that the upper bound is inclusive. Default is ``True``. |
strict_binning | (bool): | Specify how values outside of the cutoffs array should be binned. If set to ``True``, each value less than cutoffs[0] or greater than cutoffs[-1] will be assigned a bin value of -1. If set to ``False``, values less than cutoffs[0] will be included in the first bin while values greater than cutoffs[-1] will be included in the final bin. |
bin_column_name | (str): | The name for the new binned column. Default is ``<column_name>_binned`` |
Returns | (List[float]): | a list containing the edges of each bin |
- Bins IDs are 0-index, in other words, the lowest bin number is 0.
- The first and last cutoffs are always included in the bins.
When include_lowest is True, the last bin includes both cutoffs. When include_lowest is False, the first bin (bin 0) includes both cutoffs.
For these examples, we will use a frame with column a accessed by a Frame object frame:
>>> frame.inspect(n=11)
[##] a
========
[0] 1
[1] 1
[2] 2
[3] 3
[4] 5
[5] 8
[6] 13
[7] 21
[8] 34
[9] 55
[10] 89
Modify the frame with a column showing what bin the data is in, by specifying cutoffs for the bin edges. The data values should use strict_binning:
>>> frame.bin_column('a', [5, 12, 25, 60], include_lowest=True,
... strict_binning=True, bin_column_name='binned_using_cutoffs')
[===Job Progress===]
>>> frame.inspect(n=11)
[##] a binned_using_cutoffs
==============================
[0] 1 -1
[1] 1 -1
[2] 2 -1
[3] 3 -1
[4] 5 0
[5] 8 0
[6] 13 1
[7] 21 1
[8] 34 2
[9] 55 2
[10] 89 -1
Modify the frame with a column showing what bin the data is in. The data value should not use strict_binning:
>>> frame.bin_column('a', [5, 12, 25, 60], include_lowest=True,
... strict_binning=False, bin_column_name='binned_using_cutoffs')
[===Job Progress===]
>>> frame.inspect(n=11)
[##] a binned_using_cutoffs
==============================
[0] 1 0
[1] 1 0
[2] 2 0
[3] 3 0
[4] 5 0
[5] 8 0
[6] 13 1
[7] 21 1
[8] 34 2
[9] 55 2
[10] 89 2
Modify the frame with a column showing what bin the data is in. The bins should be lower inclusive:
>>> frame.bin_column('a', [1,5,34,55,89], include_lowest=True,
... strict_binning=False, bin_column_name='binned_using_cutoffs')
[===Job Progress===]
>>> frame.inspect( n=11 )
[##] a binned_using_cutoffs
==============================
[0] 1 0
[1] 1 0
[2] 2 0
[3] 3 0
[4] 5 1
[5] 8 1
[6] 13 1
[7] 21 1
[8] 34 2
[9] 55 3
[10] 89 3
Modify the frame with a column showing what bin the data is in. The bins should be upper inclusive:
>>> frame.bin_column('a', [1,5,34,55,89], include_lowest=False,
... strict_binning=True, bin_column_name='binned_using_cutoffs')
[===Job Progress===]
>>> frame.inspect( n=11 )
[##] a binned_using_cutoffs
==============================
[0] 1 0
[1] 1 0
[2] 2 0
[3] 3 0
[4] 5 0
[5] 8 1
[6] 13 1
[7] 21 1
[8] 34 1
[9] 55 2
[10] 89 3
Modify the frame with a column of 3 equal-width bins. This also returns the cutoffs that were used for creating the bins.
>>> cutoffs = frame.bin_column('a', 3, bin_column_name='equal_width_bins')
>>> print cutoffs
[1.0, 30.333333333333332, 59.666666666666664, 89.0]
>>> frame.inspect(n=frame.count())
[##] a equal_width_bins
==========================
[0] 1 0
[1] 1 0
[2] 2 0
[3] 3 0
[4] 5 0
[5] 8 0
[6] 13 0
[7] 21 0
[8] 34 1
[9] 55 1
[10] 89 2
def bin_column(self, column_name, bins=None, include_lowest=True, strict_binning=False, bin_column_name=None):
    r"""
    Summarize rows of data based on the value in a single column by sorting them
    into bins, or groups, based on a list of bin cutoff points or a specified number of
    equal-width bins.

    Parameters
    ----------

    :param column_name: (str) Name of the column to bin
    :param bins: (Optional[List[float]]) Either a single value representing the number of equal-width bins to create, or an array of values
                 containing bin cutoff points. Array can be list or tuple. If an array is provided, values must be progressively
                 increasing. All bin boundaries must be included, so, with N bins, you need N+1 values.
                 Default (None or Empty List) is equal-width bins where the maximum number of bins is the Square-root choice
                 :math:`\lfloor \sqrt{m} \rfloor`, where :math:`m` is the number of rows.
    :param include_lowest: (bool) Specify how the boundary conditions are handled. ``True`` indicates that the lower bound
                 of the bin is inclusive. ``False`` indicates that the upper bound is inclusive. Default is ``True``.
    :param strict_binning: (bool) Specify how values outside of the cutoffs array should be binned. If set to ``True``, each
                 value less than cutoffs[0] or greater than cutoffs[-1] will be assigned a bin value of -1. If set to ``False``,
                 values less than cutoffs[0] will be included in the first bin while values greater than cutoffs[-1] will be
                 included in the final bin.
    :param bin_column_name: (str) The name for the new binned column. Default is ``<column_name>_binned``
    :return: (List[float]) a list containing the edges of each bin

    Notes
    -----

    1.  Bin IDs are 0-indexed, in other words, the lowest bin number is 0.
    2.  The first and last cutoffs are always included in the bins.
        When *include_lowest* is ``True``, the last bin includes both cutoffs.
        When *include_lowest* is ``False``, the first bin (bin 0) includes both
        cutoffs.

    Examples
    --------

    For these examples, we will use a frame with column *a* accessed by a Frame
    object *frame*:

        >>> frame.inspect(n=11)
        [##] a
        ========
        [0] 1
        [1] 1
        [2] 2
        [3] 3
        [4] 5
        [5] 8
        [6] 13
        [7] 21
        [8] 34
        [9] 55
        [10] 89

    Modify the frame with a column showing what bin the data is in, by
    specifying cutoffs for the bin edges.
    The data values should use strict_binning:

        >>> frame.bin_column('a', [5, 12, 25, 60], include_lowest=True,
        ... strict_binning=True, bin_column_name='binned_using_cutoffs')
        [===Job Progress===]
        >>> frame.inspect(n=11)
        [##] a binned_using_cutoffs
        ==============================
        [0] 1 -1
        [1] 1 -1
        [2] 2 -1
        [3] 3 -1
        [4] 5 0
        [5] 8 0
        [6] 13 1
        [7] 21 1
        [8] 34 2
        [9] 55 2
        [10] 89 -1

    Modify the frame with a column showing what bin the data is in.
    The data value should not use strict_binning:

        >>> frame.bin_column('a', [5, 12, 25, 60], include_lowest=True,
        ... strict_binning=False, bin_column_name='binned_using_cutoffs')
        [===Job Progress===]
        >>> frame.inspect(n=11)
        [##] a binned_using_cutoffs
        ==============================
        [0] 1 0
        [1] 1 0
        [2] 2 0
        [3] 3 0
        [4] 5 0
        [5] 8 0
        [6] 13 1
        [7] 21 1
        [8] 34 2
        [9] 55 2
        [10] 89 2

    Modify the frame with a column showing what bin the data is in.
    The bins should be lower inclusive:

        >>> frame.bin_column('a', [1,5,34,55,89], include_lowest=True,
        ... strict_binning=False, bin_column_name='binned_using_cutoffs')
        [===Job Progress===]
        >>> frame.inspect( n=11 )
        [##] a binned_using_cutoffs
        ==============================
        [0] 1 0
        [1] 1 0
        [2] 2 0
        [3] 3 0
        [4] 5 1
        [5] 8 1
        [6] 13 1
        [7] 21 1
        [8] 34 2
        [9] 55 3
        [10] 89 3

    Modify the frame with a column showing what bin the data is in.
    The bins should be upper inclusive:

        >>> frame.bin_column('a', [1,5,34,55,89], include_lowest=False,
        ... strict_binning=True, bin_column_name='binned_using_cutoffs')
        [===Job Progress===]
        >>> frame.inspect( n=11 )
        [##] a binned_using_cutoffs
        ==============================
        [0] 1 0
        [1] 1 0
        [2] 2 0
        [3] 3 0
        [4] 5 0
        [5] 8 1
        [6] 13 1
        [7] 21 1
        [8] 34 1
        [9] 55 2
        [10] 89 3

    Modify the frame with a column of 3 equal-width bins. This also
    returns the cutoffs that were used for creating the bins.

        >>> cutoffs = frame.bin_column('a', 3, bin_column_name='equal_width_bins')
        >>> print cutoffs
        [1.0, 30.333333333333332, 59.666666666666664, 89.0]
        >>> frame.inspect(n=frame.count())
        [##] a equal_width_bins
        ==========================
        [0] 1 0
        [1] 1 0
        [2] 2 0
        [3] 3 0
        [4] 5 0
        [5] 8 0
        [6] 13 0
        [7] 21 0
        [8] 34 1
        [9] 55 1
        [10] 89 2

    """
    # Normalize `bins` to a list: tuples are copied into a list and a lone
    # value (the requested bin count) is wrapped in a single-element list.
    # None is deliberately left untouched so the documented default is not
    # turned into [None].
    # NOTE(review): assumes to_scala_option_list_double maps None to an empty
    # Scala Option (inferred from the helper's name) -- confirm in jutils.convert.
    if isinstance(bins, tuple):
        bins = list(bins)
    elif bins is not None and not isinstance(bins, list):
        bins = [bins]

    convert = self._tc.jutils.convert
    scala_edges = self._scala.binColumn(column_name,
                                        convert.to_scala_option_list_double(bins),
                                        include_lowest,
                                        strict_binning,
                                        convert.to_scala_option(bin_column_name))
    # The Scala call returns the bin edges as a Scala sequence; hand back a Python value.
    return convert.from_scala_seq(scala_edges)