Up

sparktk.models.classification.naive_bayes module

# vim: set encoding=utf-8

#  Copyright (c) 2016 Intel Corporation 
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

from sparktk.loggers import log_load; log_load(__name__); del log_load

from sparktk.propobj import PropertiesObject
from sparktk.frame.ops.classification_metrics_value import ClassificationMetricsValue
from sparktk import TkContext

__all__ = ["train", "load", "NaiveBayesModel"]

def train(frame, label_column, observation_columns, lambda_parameter=1.0):
    """
    Trains a Naive Bayes model on the given frame

    :param frame: (Frame) frame of training data
    :param label_column: (str) Column containing the label for each observation
    :param observation_columns: (List[str]) Column(s) containing the observations
    :param lambda_parameter: (float) Additive smoothing parameter. Default is 1.0

    :return: (NaiveBayesModel) Trained Naive Bayes model

    Raises ValueError if frame is None.
    """
    if frame is None:
        raise ValueError("frame cannot be None")

    # The context travels with the frame; it supplies both the JVM entry point
    # and the python-to-scala converters.
    context = frame._tc
    fitted = get_scala_obj(context).train(
        frame._scala,
        label_column,
        context.jutils.convert.to_scala_list_string(observation_columns),
        lambda_parameter)
    return NaiveBayesModel(context, fitted)


def load(path, tc=TkContext.implicit):
    """
    Loads a NaiveBayesModel from the given path

    :param path: path handed to tc.load
    :param tc: (TkContext) context used for loading; it is checked with TkContext.validate first
    :return: result of tc.load(path, NaiveBayesModel)
    """
    TkContext.validate(tc)
    restored = tc.load(path, NaiveBayesModel)
    return restored


def get_scala_obj(tc):
    """Gets reference to the scala NaiveBayesModel object, resolved through tc.sc._jvm"""
    jvm = tc.sc._jvm
    naive_bayes_pkg = jvm.org.trustedanalytics.sparktk.models.classification.naive_bayes
    return naive_bayes_pkg.NaiveBayesModel


class NaiveBayesModel(PropertiesObject):
    """
    A trained Naive Bayes model

    Example
    -------

        >>> frame = tc.frame.create([[1,19.8446136104,2.2985856384],
        ...                          [1,16.8973559126,2.6933495054],
        ...                          [1,5.5548729596, 2.7777687995],
        ...                          [0,46.1810010826,3.1611961917],
        ...                          [0,44.3117586448,3.3458963222],
        ...                          [0,34.6334526911,3.6429838715]],
        ...                          [('Class', int), ('Dim_1', float), ('Dim_2', float)])

        >>> model = tc.models.classification.naive_bayes.train(frame, 'Class', ['Dim_1', 'Dim_2'], 0.9)

        >>> model.label_column
        u'Class'

        >>> model.observation_columns
        [u'Dim_1', u'Dim_2']

        >>> model.lambda_parameter
        0.9

        >>> predicted_frame = model.predict(frame, ['Dim_1', 'Dim_2'])

        >>> predicted_frame.inspect()
        [#]  Class  Dim_1          Dim_2         predicted_class
        ========================================================
        [0]      1  19.8446136104  2.2985856384              0.0
        [1]      1  16.8973559126  2.6933495054              1.0
        [2]      1   5.5548729596  2.7777687995              1.0
        [3]      0  46.1810010826  3.1611961917              0.0
        [4]      0  44.3117586448  3.3458963222              0.0
        [5]      0  34.6334526911  3.6429838715              0.0

        >>> model.save("sandbox/naivebayes")

        >>> restored = tc.load("sandbox/naivebayes")

        >>> restored.label_column == model.label_column
        True

        >>> restored.lambda_parameter == model.lambda_parameter
        True

        >>> set(restored.observation_columns) == set(model.observation_columns)
        True

        >>> metrics = model.test(frame)

        >>> metrics.precision
        1.0

        >>> predicted_frame2 = restored.predict(frame, ['Dim_1', 'Dim_2'])

        >>> predicted_frame2.inspect()
        [#]  Class  Dim_1          Dim_2         predicted_class
        ========================================================
        [0]      1  19.8446136104  2.2985856384              0.0
        [1]      1  16.8973559126  2.6933495054              1.0
        [2]      1   5.5548729596  2.7777687995              1.0
        [3]      0  46.1810010826  3.1611961917              0.0
        [4]      0  44.3117586448  3.3458963222              0.0
        [5]      0  34.6334526911  3.6429838715              0.0


        >>> canonical_path = model.export_to_mar("sandbox/naivebayes.mar")


    """

    def __init__(self, tc, scala_model):
        """
        Wraps a scala-side Naive Bayes model in a python object

        :param tc: (TkContext) context providing the JVM utilities (jutils) used by this model
        :param scala_model: JVM model object; validated as an instance of the scala NaiveBayesModel
        """
        self._tc = tc
        tc.jutils.validate_is_jvm_instance_of(scala_model, get_scala_obj(tc))
        self._scala = scala_model

    @staticmethod
    def _from_scala(tc, scala_model):
        """Builds a NaiveBayesModel from a context and an existing scala model"""
        return NaiveBayesModel(tc, scala_model)

    @property
    def label_column(self):
        """Name of the label column, as reported by the scala model"""
        return self._scala.labelColumn()

    @property
    def observation_columns(self):
        """Names of the observation columns, converted from the scala model's sequence"""
        return self._tc.jutils.convert.from_scala_seq(self._scala.observationColumns())

    @property
    def lambda_parameter(self):
        """Smoothing parameter (lambda), as reported by the scala model"""
        return self._scala.lambdaParameter()

    def predict(self, frame, columns=None):
        """
        Predicts the labels for the observation columns in the given input frame. Creates a new frame
        with the existing columns and a new predicted column.

        Parameters
        ----------

        :param frame: (Frame) Frame used for predicting the values
        :param columns: (Optional[List[str]]) Names of the observation columns.  When None, an empty
                        scala Option is passed and the choice of columns is left to the scala model
                        (presumably the columns used during training -- confirm against the scala side).
        :return: (Frame) A new frame containing the original frame's columns and a prediction column
        """
        c = self.__columns_to_option(columns)
        # Imported here rather than at module level, as in the original code.
        from sparktk.frame.frame import Frame
        return Frame(self._tc, self._scala.predict(frame._scala, c))

    def test(self, frame, columns=None):
        """
        Runs the scala model's test against the given frame and wraps the result as classification metrics.

        Parameters
        ----------

        :param frame: (Frame) Frame handed to the scala model's test
        :param columns: (Optional[List[str]]) Names of the observation columns.  When None, an empty
                        scala Option is passed to the scala model.
        :return: (ClassificationMetricsValue) Wrapper around the scala test result
        """
        c = self.__columns_to_option(columns)
        return ClassificationMetricsValue(self._tc, self._scala.test(frame._scala, c))

    def __columns_to_option(self, c):
        """Converts an optional list of column names to a scala Option (of a scala string list when given)"""
        if c is not None:
            c = self._tc.jutils.convert.to_scala_list_string(c)
        return self._tc.jutils.convert.to_scala_option(c)

    def save(self, path):
        """
        Saves the model to the given path through the scala model's save

        Parameters
        ----------

        :param path: (str) Path to save the model to
        """
        self._scala.save(self._tc._scala_sc, path)

    def export_to_mar(self, path):
        """
        Exports the trained model as a model archive (.mar) to the specified path

        Parameters
        ----------

        :param path: (str) Path to save the trained model
        :return: (str) Full path to the saved .mar file.  If path is not a string, nothing is exported
                 and None is returned.
        """
        if isinstance(path, basestring):
            return self._scala.exportToMar(self._tc._scala_sc, path)

del PropertiesObject

Functions

def load(

path, tc=<class 'sparktk.arguments.implicit'>)

load NaiveBayesModel from given path

def load(path, tc=TkContext.implicit):
    """
    Loads a NaiveBayesModel from the given path

    :param path: path handed to tc.load
    :param tc: (TkContext) context used for loading; it is checked with TkContext.validate first
    :return: result of tc.load(path, NaiveBayesModel)
    """
    TkContext.validate(tc)
    restored = tc.load(path, NaiveBayesModel)
    return restored

def train(

frame, label_column, observation_columns, lambda_parameter=1.0)

Creates a Naive Bayes model by training on the given frame

frame(Frame):frame of training data
label_column(str):Column containing the label for each observation
observation_columns(List[str]):Column(s) containing the observations
lambda_parameter(float):Additive smoothing parameter. Default is 1.0

Returns(NaiveBayesModel): Trained Naive Bayes model

def train(frame, label_column, observation_columns, lambda_parameter=1.0):
    """
    Trains a Naive Bayes model on the given frame

    :param frame: (Frame) frame of training data
    :param label_column: (str) Column containing the label for each observation
    :param observation_columns: (List[str]) Column(s) containing the observations
    :param lambda_parameter: (float) Additive smoothing parameter. Default is 1.0

    :return: (NaiveBayesModel) Trained Naive Bayes model

    Raises ValueError if frame is None.
    """
    if frame is None:
        raise ValueError("frame cannot be None")

    # The context travels with the frame; it supplies both the JVM entry point
    # and the python-to-scala converters.
    context = frame._tc
    fitted = get_scala_obj(context).train(
        frame._scala,
        label_column,
        context.jutils.convert.to_scala_list_string(observation_columns),
        lambda_parameter)
    return NaiveBayesModel(context, fitted)

Classes

class NaiveBayesModel

A trained Naive Bayes model

Example:
>>> frame = tc.frame.create([[1,19.8446136104,2.2985856384],
...                          [1,16.8973559126,2.6933495054],
...                          [1,5.5548729596, 2.7777687995],
...                          [0,46.1810010826,3.1611961917],
...                          [0,44.3117586448,3.3458963222],
...                          [0,34.6334526911,3.6429838715]],
...                          [('Class', int), ('Dim_1', float), ('Dim_2', float)])

>>> model = tc.models.classification.naive_bayes.train(frame, 'Class', ['Dim_1', 'Dim_2'], 0.9)

>>> model.label_column
u'Class'

>>> model.observation_columns
[u'Dim_1', u'Dim_2']

>>> model.lambda_parameter
0.9

>>> predicted_frame = model.predict(frame, ['Dim_1', 'Dim_2'])

>>> predicted_frame.inspect()
[#]  Class  Dim_1          Dim_2         predicted_class
========================================================
[0]      1  19.8446136104  2.2985856384              0.0
[1]      1  16.8973559126  2.6933495054              1.0
[2]      1   5.5548729596  2.7777687995              1.0
[3]      0  46.1810010826  3.1611961917              0.0
[4]      0  44.3117586448  3.3458963222              0.0
[5]      0  34.6334526911  3.6429838715              0.0

>>> model.save("sandbox/naivebayes")

>>> restored = tc.load("sandbox/naivebayes")

>>> restored.label_column == model.label_column
True

>>> restored.lambda_parameter == model.lambda_parameter
True

>>> set(restored.observation_columns) == set(model.observation_columns)
True

>>> metrics = model.test(frame)

>>> metrics.precision
1.0

>>> predicted_frame2 = restored.predict(frame, ['Dim_1', 'Dim_2'])

>>> predicted_frame2.inspect()
[#]  Class  Dim_1          Dim_2         predicted_class
========================================================
[0]      1  19.8446136104  2.2985856384              0.0
[1]      1  16.8973559126  2.6933495054              1.0
[2]      1   5.5548729596  2.7777687995              1.0
[3]      0  46.1810010826  3.1611961917              0.0
[4]      0  44.3117586448  3.3458963222              0.0
[5]      0  34.6334526911  3.6429838715              0.0


>>> canonical_path = model.export_to_mar("sandbox/naivebayes.mar")
class NaiveBayesModel(PropertiesObject):
    """
    A trained Naive Bayes model

    Example
    -------

        >>> frame = tc.frame.create([[1,19.8446136104,2.2985856384],
        ...                          [1,16.8973559126,2.6933495054],
        ...                          [1,5.5548729596, 2.7777687995],
        ...                          [0,46.1810010826,3.1611961917],
        ...                          [0,44.3117586448,3.3458963222],
        ...                          [0,34.6334526911,3.6429838715]],
        ...                          [('Class', int), ('Dim_1', float), ('Dim_2', float)])

        >>> model = tc.models.classification.naive_bayes.train(frame, 'Class', ['Dim_1', 'Dim_2'], 0.9)

        >>> model.label_column
        u'Class'

        >>> model.observation_columns
        [u'Dim_1', u'Dim_2']

        >>> model.lambda_parameter
        0.9

        >>> predicted_frame = model.predict(frame, ['Dim_1', 'Dim_2'])

        >>> predicted_frame.inspect()
        [#]  Class  Dim_1          Dim_2         predicted_class
        ========================================================
        [0]      1  19.8446136104  2.2985856384              0.0
        [1]      1  16.8973559126  2.6933495054              1.0
        [2]      1   5.5548729596  2.7777687995              1.0
        [3]      0  46.1810010826  3.1611961917              0.0
        [4]      0  44.3117586448  3.3458963222              0.0
        [5]      0  34.6334526911  3.6429838715              0.0

        >>> model.save("sandbox/naivebayes")

        >>> restored = tc.load("sandbox/naivebayes")

        >>> restored.label_column == model.label_column
        True

        >>> restored.lambda_parameter == model.lambda_parameter
        True

        >>> set(restored.observation_columns) == set(model.observation_columns)
        True

        >>> metrics = model.test(frame)

        >>> metrics.precision
        1.0

        >>> predicted_frame2 = restored.predict(frame, ['Dim_1', 'Dim_2'])

        >>> predicted_frame2.inspect()
        [#]  Class  Dim_1          Dim_2         predicted_class
        ========================================================
        [0]      1  19.8446136104  2.2985856384              0.0
        [1]      1  16.8973559126  2.6933495054              1.0
        [2]      1   5.5548729596  2.7777687995              1.0
        [3]      0  46.1810010826  3.1611961917              0.0
        [4]      0  44.3117586448  3.3458963222              0.0
        [5]      0  34.6334526911  3.6429838715              0.0


        >>> canonical_path = model.export_to_mar("sandbox/naivebayes.mar")


    """

    def __init__(self, tc, scala_model):
        """
        Wraps a scala-side Naive Bayes model in a python object

        :param tc: (TkContext) context providing the JVM utilities (jutils) used by this model
        :param scala_model: JVM model object; validated as an instance of the scala NaiveBayesModel
        """
        self._tc = tc
        tc.jutils.validate_is_jvm_instance_of(scala_model, get_scala_obj(tc))
        self._scala = scala_model

    @staticmethod
    def _from_scala(tc, scala_model):
        """Builds a NaiveBayesModel from a context and an existing scala model"""
        return NaiveBayesModel(tc, scala_model)

    @property
    def label_column(self):
        """Name of the label column, as reported by the scala model"""
        return self._scala.labelColumn()

    @property
    def observation_columns(self):
        """Names of the observation columns, converted from the scala model's sequence"""
        return self._tc.jutils.convert.from_scala_seq(self._scala.observationColumns())

    @property
    def lambda_parameter(self):
        """Smoothing parameter (lambda), as reported by the scala model"""
        return self._scala.lambdaParameter()

    def predict(self, frame, columns=None):
        """
        Predicts the labels for the observation columns in the given input frame. Creates a new frame
        with the existing columns and a new predicted column.

        Parameters
        ----------

        :param frame: (Frame) Frame used for predicting the values
        :param columns: (Optional[List[str]]) Names of the observation columns.  When None, an empty
                        scala Option is passed and the choice of columns is left to the scala model
                        (presumably the columns used during training -- confirm against the scala side).
        :return: (Frame) A new frame containing the original frame's columns and a prediction column
        """
        c = self.__columns_to_option(columns)
        # Imported here rather than at module level, as in the original code.
        from sparktk.frame.frame import Frame
        return Frame(self._tc, self._scala.predict(frame._scala, c))

    def test(self, frame, columns=None):
        """
        Runs the scala model's test against the given frame and wraps the result as classification metrics.

        Parameters
        ----------

        :param frame: (Frame) Frame handed to the scala model's test
        :param columns: (Optional[List[str]]) Names of the observation columns.  When None, an empty
                        scala Option is passed to the scala model.
        :return: (ClassificationMetricsValue) Wrapper around the scala test result
        """
        c = self.__columns_to_option(columns)
        return ClassificationMetricsValue(self._tc, self._scala.test(frame._scala, c))

    def __columns_to_option(self, c):
        """Converts an optional list of column names to a scala Option (of a scala string list when given)"""
        if c is not None:
            c = self._tc.jutils.convert.to_scala_list_string(c)
        return self._tc.jutils.convert.to_scala_option(c)

    def save(self, path):
        """
        Saves the model to the given path through the scala model's save

        Parameters
        ----------

        :param path: (str) Path to save the model to
        """
        self._scala.save(self._tc._scala_sc, path)

    def export_to_mar(self, path):
        """
        Exports the trained model as a model archive (.mar) to the specified path

        Parameters
        ----------

        :param path: (str) Path to save the trained model
        :return: (str) Full path to the saved .mar file.  If path is not a string, nothing is exported
                 and None is returned.
        """
        if isinstance(path, basestring):
            return self._scala.exportToMar(self._tc._scala_sc, path)

Ancestors (in MRO)

Instance variables

var label_column

var lambda_parameter

var observation_columns

Methods

def __init__(

self, tc, scala_model)

def __init__(self, tc, scala_model):
    """
    Wraps a scala-side Naive Bayes model in a python object

    :param tc: (TkContext) context providing the JVM utilities (jutils) used by this model
    :param scala_model: JVM model object; validated as an instance of the scala NaiveBayesModel
    """
    self._tc = tc
    # Validate before storing, so a wrong JVM type never ends up in self._scala.
    check_jvm_type = tc.jutils.validate_is_jvm_instance_of
    check_jvm_type(scala_model, get_scala_obj(tc))
    self._scala = scala_model

def export_to_mar(

self, path)

Exports the trained model as a model archive (.mar) to the specified path

Parameters:
path(str):Path to save the trained model

Returns(str): Full path to the saved .mar file

def export_to_mar(self, path):
    """
    Exports the trained model as a model archive (.mar) to the specified path

    Parameters
    ----------

    :param path: (str) Path to save the trained model
    :return: (str) Full path to the saved .mar file; None when path is not a string
    """
    if not isinstance(path, basestring):
        # Non-string paths are not exported; the caller gets None back.
        return None
    return self._scala.exportToMar(self._tc._scala_sc, path)

def predict(

self, frame, columns=None)

Predicts the labels for the observation columns in the given input frame. Creates a new frame with the existing columns and a new predicted column.

Parameters:
frame(Frame):Frame used for predicting the values
columns(List[str]):Names of the observation columns.

Returns(Frame): A new frame containing the original frame's columns and a prediction column

def predict(self, frame, columns=None):
    """
    Predicts the labels for the observation columns in the given input frame. Creates a new frame
    with the existing columns and a new predicted column.

    Parameters
    ----------

    :param frame: (Frame) Frame used for predicting the values
    :param columns: (List[str]) Names of the observation columns.
    :return: (Frame) A new frame containing the original frame's columns and a prediction column
    """
    scala_columns = self.__columns_to_option(columns)
    from sparktk.frame.frame import Frame
    predicted = self._scala.predict(frame._scala, scala_columns)
    return Frame(self._tc, predicted)

def save(

self, path)

def save(self, path):
    """
    Saves the model to the given path through the scala model's save

    :param path: (str) Path to save the model to
    """
    persist = self._scala.save
    persist(self._tc._scala_sc, path)

def test(

self, frame, columns=None)

def test(self, frame, columns=None):
    """
    Runs the scala model's test against the given frame and wraps the result as classification metrics.

    :param frame: (Frame) Frame handed to the scala model's test
    :param columns: (List[str]) Names of the observation columns; may be None
    :return: (ClassificationMetricsValue) Wrapper around the scala test result
    """
    scala_columns = self.__columns_to_option(columns)
    raw_metrics = self._scala.test(frame._scala, scala_columns)
    return ClassificationMetricsValue(self._tc, raw_metrics)

def to_dict(

self)

def to_dict(self):
    """Returns the result of _properties() updated with the entries from _attributes()"""
    merged = self._properties()
    extra = self._attributes()
    # Entries from _attributes() win when a key appears in both.
    merged.update(extra)
    return merged

def to_json(

self)

def to_json(self):
    """Serializes the dictionary returned by to_dict() to a JSON string"""
    serialize = json.dumps
    as_dict = self.to_dict()
    return serialize(as_dict)