sparktk.frame.ops.histogram module
# vim: set encoding=utf-8
# Copyright (c) 2016 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from sparktk.propobj import PropertiesObject
class Histogram(PropertiesObject):
    """
    Container for the result of a histogram computation.

    Stores the three values handed to the constructor (cutoffs, hist and
    density) and exposes each one through a getter-only property of the
    same name.
    """

    def __init__(self, cutoffs, hist, density):
        """
        :param cutoffs: value returned by the ``cutoffs`` property
        :param hist: value returned by the ``hist`` property
        :param density: value returned by the ``density`` property
        """
        # NOTE(review): the base-class __init__ is not invoked here -- confirm
        # that PropertiesObject does not require it.
        # The arguments are stored as given; nothing is copied or validated.
        self._cutoffs = cutoffs
        self._hist = hist
        self._density = density

    @property
    def cutoffs(self):
        """The ``cutoffs`` value given at construction."""
        return self._cutoffs

    @property
    def density(self):
        """The ``density`` value given at construction."""
        return self._density

    @property
    def hist(self):
        """The ``hist`` value given at construction."""
        return self._hist
def histogram(self, column_name, num_bins=None, weight_column_name=None, bin_type="equalwidth"):
    """
    Compute the histogram for a column in a frame.

    The returned value is a Histogram object containing 3 lists, one each for:
    the cutoff points of the bins, the size of each bin, and the density of each bin.

    Parameters
    ----------

    :param column_name: (str) Name of column to be evaluated.
    :param num_bins: (Optional[int]) Number of bins in histogram.
                     By default the square-root choice is used
                     (in other words, math.floor(math.sqrt(frame.count()))).
    :param weight_column_name: (Optional[str]) Name of column containing weights.
                               By default all observations are weighted equally.
    :param bin_type: (str["equalwidth"|"equaldepth"]) The type of binning algorithm to use.
                     Default is "equalwidth".
    :return: (Histogram) A Histogram object containing the result set.
             The data returned is composed of multiple components:

             cutoffs : array of float
                 A list containing the edges of each bin.
             hist : array of float
                 A list containing the count of the weighted observations found in each bin.
             density : array of float
                 A list containing, for each bin, the fraction (as a decimal) of the
                 total observations that falls in that bin.

    Notes
    -----
    The num_bins parameter is considered to be the maximum permissible number
    of bins because the data may dictate fewer bins.
    With equal depth binning, for example, if the column to be binned has 10
    elements with only 2 distinct values and the *num_bins* parameter is
    greater than 2, then the actual number of bins will only be 2.
    This is due to a restriction that elements with an identical value must
    belong to the same bin.

    Examples
    --------

    Consider the following sample data set:

        >>> frame.inspect()
        [#]  a  b
        =========
        [0]  a  2
        [1]  b  7
        [2]  c  3
        [3]  d  9
        [4]  e  1

    A simple call for 3 equal-width bins gives:

        >>> hist = frame.histogram("b", num_bins=3)
        >>> hist.cutoffs
        [1.0, 3.6666666666666665, 6.333333333333333, 9.0]

        >>> hist.hist
        [3.0, 0.0, 2.0]

        >>> hist.density
        [0.6, 0.0, 0.4]

    Switching to equal depth gives:

        >>> hist = frame.histogram("b", num_bins=3, bin_type='equaldepth')
        >>> hist.cutoffs
        [1.0, 2.0, 7.0, 9.0]

        >>> hist.hist
        [1.0, 2.0, 2.0]

        >>> hist.density
        [0.2, 0.4, 0.4]

    Plot hist as a bar chart using matplotlib. cutoffs holds one more entry
    than hist, so the last edge is dropped to obtain the left edge of every
    bar; the single width used here assumes equal-width bins:

        >>> import matplotlib.pyplot as plt
        >>> plt.bar(hist.cutoffs[:-1], hist.hist, width=hist.cutoffs[1] - hist.cutoffs[0])

    """
    convert = self._tc.jutils.convert

    # The optional arguments go through to_scala_option before being handed to
    # the Scala-side histogram call (presumably wrapping them as Scala Options
    # -- confirm against jutils.convert).
    scala_histogram = self._scala.histogram(column_name,
                                            convert.to_scala_option(num_bins),
                                            convert.to_scala_option(weight_column_name),
                                            bin_type)

    # The converted mapping is expanded into keyword arguments, so its keys
    # have to match the Histogram constructor: cutoffs, hist and density.
    histogram_fields = convert.scala_map_to_python_with_iterable_values(scala_histogram)
    return Histogram(**histogram_fields)
Functions
def histogram(
self, column_name, num_bins=None, weight_column_name=None, bin_type='equalwidth')
Compute the histogram for a column in a frame.
The returned value is a Histogram object containing 3 lists one each for: the cutoff points of the bins, size of each bin, and density of each bin.
column_name | (str): | Name of column to be evaluated. |
num_bins | (Optional[int]): | Number of bins in histogram. By default the square-root choice is used (in other words, math.floor(math.sqrt(frame.count()))). |
weight_column_name | (Optional[str]): | Name of column containing weights. Default is all observations are weighted equally. |
bin_type | (str["equalwidth"|"equaldepth"]): | The type of binning algorithm to use: ["equalwidth"|"equaldepth"]. Default is "equalwidth". |
Returns | (Histogram): | A Histogram object containing the result set.
The data returned is composed of multiple components: cutoffs : array of float A list containing the edges of each bin. hist : array of float A list containing the count of the weighted observations found in each bin. density : array of float A list containing, for each bin, the fraction (as a decimal) of the total observations that falls in that bin. |
The num_bins parameter is considered to be the maximum permissible number of bins because the data may dictate fewer bins. With equal depth binning, for example, if the column to be binned has 10 elements with only 2 distinct values and the num_bins parameter is greater than 2, then the actual number of bins will only be 2. This is due to a restriction that elements with an identical value must belong to the same bin.
Consider the following sample data set:
>>> frame.inspect()
[#] a b
=========
[0] a 2
[1] b 7
[2] c 3
[3] d 9
[4] e 1
A simple call for 3 equal-width bins gives:
>>> hist = frame.histogram("b", num_bins=3)
>>> hist.cutoffs
[1.0, 3.6666666666666665, 6.333333333333333, 9.0]
>>> hist.hist
[3.0, 0.0, 2.0]
>>> hist.density
[0.6, 0.0, 0.4]
Switching to equal depth gives:
>>> hist = frame.histogram("b", num_bins=3, bin_type='equaldepth')
>>> hist.cutoffs
[1.0, 2.0, 7.0, 9.0]
>>> hist.hist
[1.0, 2.0, 2.0]
>>> hist.density
[0.2, 0.4, 0.4]
Plot hist as a bar chart using matplotlib:
>>> import matplotlib.pyplot as plt
>>> plt.bar(hist.cutoffs[:-1], hist.hist, width=hist.cutoffs[1] - hist.cutoffs[0])
Equivalent call, wrapped across two lines:
>>> import matplotlib.pyplot as plt
>>> plt.bar(hist.cutoffs[:-1], hist.hist, width=hist.cutoffs[1] -
... hist.cutoffs[0])
def histogram(self, column_name, num_bins=None, weight_column_name=None, bin_type="equalwidth"):
    """
    Compute the histogram for a column in a frame.

    The returned value is a Histogram object containing 3 lists, one each for:
    the cutoff points of the bins, the size of each bin, and the density of each bin.

    Parameters
    ----------

    :param column_name: (str) Name of column to be evaluated.
    :param num_bins: (Optional[int]) Number of bins in histogram.
                     By default the square-root choice is used
                     (in other words, math.floor(math.sqrt(frame.count()))).
    :param weight_column_name: (Optional[str]) Name of column containing weights.
                               By default all observations are weighted equally.
    :param bin_type: (str["equalwidth"|"equaldepth"]) The type of binning algorithm to use.
                     Default is "equalwidth".
    :return: (Histogram) A Histogram object containing the result set.
             The data returned is composed of multiple components:

             cutoffs : array of float
                 A list containing the edges of each bin.
             hist : array of float
                 A list containing the count of the weighted observations found in each bin.
             density : array of float
                 A list containing, for each bin, the fraction (as a decimal) of the
                 total observations that falls in that bin.

    Notes
    -----
    The num_bins parameter is considered to be the maximum permissible number
    of bins because the data may dictate fewer bins.
    With equal depth binning, for example, if the column to be binned has 10
    elements with only 2 distinct values and the *num_bins* parameter is
    greater than 2, then the actual number of bins will only be 2.
    This is due to a restriction that elements with an identical value must
    belong to the same bin.

    Examples
    --------

    Consider the following sample data set:

        >>> frame.inspect()
        [#]  a  b
        =========
        [0]  a  2
        [1]  b  7
        [2]  c  3
        [3]  d  9
        [4]  e  1

    A simple call for 3 equal-width bins gives:

        >>> hist = frame.histogram("b", num_bins=3)
        >>> hist.cutoffs
        [1.0, 3.6666666666666665, 6.333333333333333, 9.0]

        >>> hist.hist
        [3.0, 0.0, 2.0]

        >>> hist.density
        [0.6, 0.0, 0.4]

    Switching to equal depth gives:

        >>> hist = frame.histogram("b", num_bins=3, bin_type='equaldepth')
        >>> hist.cutoffs
        [1.0, 2.0, 7.0, 9.0]

        >>> hist.hist
        [1.0, 2.0, 2.0]

        >>> hist.density
        [0.2, 0.4, 0.4]

    Plot hist as a bar chart using matplotlib. cutoffs holds one more entry
    than hist, so the last edge is dropped to obtain the left edge of every
    bar; the single width used here assumes equal-width bins:

        >>> import matplotlib.pyplot as plt
        >>> plt.bar(hist.cutoffs[:-1], hist.hist, width=hist.cutoffs[1] - hist.cutoffs[0])

    """
    convert = self._tc.jutils.convert

    # The optional arguments go through to_scala_option before being handed to
    # the Scala-side histogram call (presumably wrapping them as Scala Options
    # -- confirm against jutils.convert).
    scala_histogram = self._scala.histogram(column_name,
                                            convert.to_scala_option(num_bins),
                                            convert.to_scala_option(weight_column_name),
                                            bin_type)

    # The converted mapping is expanded into keyword arguments, so its keys
    # have to match the Histogram constructor: cutoffs, hist and density.
    histogram_fields = convert.scala_map_to_python_with_iterable_values(scala_histogram)
    return Histogram(**histogram_fields)
Classes
class Histogram
class Histogram(PropertiesObject):
    """
    Container for the result of a histogram computation.

    Stores the three values handed to the constructor (cutoffs, hist and
    density) and exposes each one through a getter-only property of the
    same name.
    """

    def __init__(self, cutoffs, hist, density):
        """
        :param cutoffs: value returned by the ``cutoffs`` property
        :param hist: value returned by the ``hist`` property
        :param density: value returned by the ``density`` property
        """
        # NOTE(review): the base-class __init__ is not invoked here -- confirm
        # that PropertiesObject does not require it.
        # The arguments are stored as given; nothing is copied or validated.
        self._cutoffs = cutoffs
        self._hist = hist
        self._density = density

    @property
    def cutoffs(self):
        """The ``cutoffs`` value given at construction."""
        return self._cutoffs

    @property
    def density(self):
        """The ``density`` value given at construction."""
        return self._density

    @property
    def hist(self):
        """The ``hist`` value given at construction."""
        return self._hist
Ancestors (in MRO)
- Histogram
- sparktk.propobj.PropertiesObject
- __builtin__.object
Instance variables
var cutoffs
var density
var hist
Methods
def __init__(
self, cutoffs, hist, density)
def __init__(self, cutoffs, hist, density):
    """
    Store the three histogram components on the instance.

    :param cutoffs: value kept in ``self._cutoffs``
    :param hist: value kept in ``self._hist``
    :param density: value kept in ``self._density``
    """
    # The arguments are stored as given; nothing is copied or validated.
    self._cutoffs = cutoffs
    self._hist = hist
    self._density = density
def to_dict(
self)
def to_dict(self):
    """
    Return the object's properties and attributes merged into one mapping.

    The mapping returned by ``self._properties()`` is updated in place with
    the entries of ``self._attributes()`` and then returned, so an attribute
    entry replaces a property entry stored under the same key.
    (Both helpers are defined outside this view.)
    """
    merged = self._properties()
    attribute_entries = self._attributes()
    merged.update(attribute_entries)
    return merged
def to_json(
self)
def to_json(self):
    """Return the mapping produced by ``self.to_dict()`` serialized as a JSON string."""
    serialize = json.dumps
    return serialize(self.to_dict())