sparktk.models.classification.svm module
# vim: set encoding=utf-8
# Copyright (c) 2016 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from sparktk.loggers import log_load; log_load(__name__); del log_load
from sparktk.propobj import PropertiesObject
from sparktk.frame.ops.classification_metrics_value import ClassificationMetricsValue
from sparktk import TkContext
__all__ = ["train", "load", "SvmModel"]
def train(frame,
          label_column,
          observation_columns,
          intercept=True,
          num_iterations=100,
          step_size=1.0,
          reg_type=None,
          reg_param=0.01,
          mini_batch_fraction=1.0):
    """
    Train an SVM model (SGD optimizer) on the given frame and wrap the result in a SvmModel

    Parameters
    ----------

    :param frame: (Frame) frame of training data
    :param label_column: (str) Column containing the label for each observation
    :param observation_columns: (list(str)) Column(s) containing the observations
    :param intercept: (boolean) Flag indicating if the algorithm adds an intercept.  Default is true
    :param num_iterations: (int) Number of iterations for SGD.  Default is 100
    :param step_size: (float) Initial step size for SGD optimizer for the first step.  Default is 1.0
    :param reg_type: (Optional(str)) Regularization "L1" or "L2".  The python-side default is None, which is
                     handed to the scala trainer as an option; the effective default is documented as "L2"
    :param reg_param: (float) Regularization parameter.  Default is 0.01
    :param mini_batch_fraction: (float) Set fraction of data to be used for each SGD iteration.  Default is 1.0;
                                corresponding to deterministic/classical gradient descent
    :return: (SvmModel) The SVM trained model (with SGD)

    Raises ValueError when frame is None.

    Notes
    -----
    Support Vector Machine is a supervised algorithm used to perform binary classification.  A Support Vector
    Machine constructs a high dimensional hyperplane which is said to achieve a good separation when a hyperplane
    has the largest distance to the nearest training-data point of any class.  This model runs the MLLib
    implementation of SVM with SGD optimizer.  The SVM model is initialized, trained on columns of a frame, used
    to predict the labels of observations in a frame, and tests the predicted labels against the true labels.
    During testing, labels of the observations are predicted and tested against the true labels using built-in
    binary Classification Metrics.
    """
    if frame is None:
        raise ValueError("frame cannot be None")

    tc = frame._tc
    # Bound `train` method of the scala-side SvmModel object
    scala_train = get_scala_obj(tc).train
    # Positional arguments in the order the scala trainer is called with;
    # python lists / optionals are converted to their scala counterparts
    scala_args = (
        frame._scala,
        label_column,
        tc.jutils.convert.to_scala_list_string(observation_columns),
        intercept,
        num_iterations,
        step_size,
        tc.jutils.convert.to_scala_option(reg_type),
        reg_param,
        mini_batch_fraction,
    )
    return SvmModel(tc, scala_train(*scala_args))
def load(path, tc=TkContext.implicit):
    """
    load SvmModel from given path

    Parameters
    ----------

    :param path: (str) Location the model is loaded from
    :param tc: (TkContext) Context used for loading; checked with TkContext.validate before use
    :return: Whatever tc.load returns for this path when asked for a SvmModel
    """
    TkContext.validate(tc)
    restored_model = tc.load(path, SvmModel)
    return restored_model
def get_scala_obj(tc):
    """
    Gets reference to the scala object

    :param tc: context object exposing the JVM view as tc.sc._jvm
    :return: the org.trustedanalytics.sparktk.models.classification.svm.SvmModel entry of that JVM view
    """
    jvm_view = tc.sc._jvm
    svm_package = jvm_view.org.trustedanalytics.sparktk.models.classification.svm
    return svm_package.SvmModel
class SvmModel(PropertiesObject):
    """
    A trained Svm model

    Example
    -------

        >>> frame = tc.frame.create([[-48.0,1], [-75.0,1], [-63.0,1], [-57.0,1], [73.0,0], [-33.0,1], [100.0,0],
        ...                          [-54.0,1], [78.0,0], [48.0,0], [-55.0,1], [23.0,0], [45.0,0], [75.0,0]],
        ...                          [("data", float),("label", str)])

        >>> frame.inspect()
        [#]  data   label
        =================
        [0]  -48.0  1
        [1]  -75.0  1
        [2]  -63.0  1
        [3]  -57.0  1
        [4]   73.0  0
        [5]  -33.0  1
        [6]  100.0  0
        [7]  -54.0  1
        [8]   78.0  0
        [9]   48.0  0

        >>> model = tc.models.classification.svm.train(frame, 'label', ['data'])

        >>> model.label_column
        u'label'

        >>> model.observation_columns
        [u'data']

        >>> predicted_frame = model.predict(frame, ['data'])

        >>> predicted_frame.inspect()
        [#]  data   label  predicted_label
        ==================================
        [0]  -48.0  1                    1
        [1]  -75.0  1                    1
        [2]  -63.0  1                    1
        [3]  -57.0  1                    1
        [4]   73.0  0                    0
        [5]  -33.0  1                    1
        [6]  100.0  0                    0
        [7]  -54.0  1                    1
        [8]   78.0  0                    0
        [9]   48.0  0                    0

        >>> test_metrics = model.test(predicted_frame)

        >>> test_metrics
        accuracy         = 1.0
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos                  7              0
        Actual_Neg                  0              7
        f_measure        = 1.0
        precision        = 1.0
        recall           = 1.0

        >>> model.save("sandbox/svm")

        >>> restored = tc.load("sandbox/svm")

        >>> restored.label_column == model.label_column
        True

        >>> restored.intercept == model.intercept
        True

        >>> set(restored.observation_columns) == set(model.observation_columns)
        True

        >>> predicted_frame2 = restored.predict(frame)

        >>> predicted_frame2.inspect()
        [#]  data   label  predicted_label
        ==================================
        [0]  -48.0  1                    1
        [1]  -75.0  1                    1
        [2]  -63.0  1                    1
        [3]  -57.0  1                    1
        [4]   73.0  0                    0
        [5]  -33.0  1                    1
        [6]  100.0  0                    0
        [7]  -54.0  1                    1
        [8]   78.0  0                    0
        [9]   48.0  0                    0

        >>> canonical_path = model.export_to_mar("sandbox/SVM.mar")

    """

    def __init__(self, tc, scala_model):
        """
        Wrap a scala SvmModel

        :param tc: (TkContext) context the model belongs to
        :param scala_model: scala-side model; checked against the class returned by get_scala_obj(tc)
        """
        self._tc = tc
        tc.jutils.validate_is_jvm_instance_of(scala_model, get_scala_obj(tc))
        self._scala = scala_model

    @staticmethod
    def _from_scala(tc, scala_model):
        """Build a python SvmModel around an existing scala model"""
        return SvmModel(tc, scala_model)

    @property
    def label_column(self):
        """column containing the label used during model training"""
        return self._scala.labelColumn()

    @property
    def observation_columns(self):
        """columns containing the observation values used during model training"""
        return self._tc.jutils.convert.from_scala_seq(self._scala.observationColumns())

    @property
    def intercept(self):
        """intercept used during model training"""
        return self._scala.intercept()

    @property
    def num_iterations(self):
        """max number of iterations allowed during model training"""
        return self._scala.numIterations()

    @property
    def step_size(self):
        """step size value used to train the model"""
        return self._scala.stepSize()

    @property
    def reg_type(self):
        """regularization type used to train the model"""
        return self._tc.jutils.convert.from_scala_option(self._scala.regType())

    @property
    def reg_param(self):
        """regularization parameter used to train the model"""
        return self._scala.regParam()

    @property
    def mini_batch_fraction(self):
        """mini-batch fraction used to train the model"""
        return self._scala.miniBatchFraction()

    def predict(self, frame, columns=None):
        """
        Predicts the labels for the observation columns in the given input frame. Creates a new frame
        with the existing columns and a new predicted column.

        Parameters
        ----------

        :param frame: (Frame) Frame used for predicting the values
        :param columns: (List[str]) Names of the observation columns.  Default is None, which is passed to
                        the scala model as an empty option
        :return: (Frame) A new frame containing the original frame's columns and a prediction column
        """
        c = self.__columns_to_option(columns)
        # Imported here rather than at module level (presumably to avoid a circular import -- confirm)
        from sparktk.frame.frame import Frame
        return Frame(self._tc, self._scala.predict(frame._scala, c))

    def test(self, frame, columns=None):
        """
        test the frame given the trained model

        Parameters
        ----------

        :param frame: (Frame) Frame to run the scala model's test on
        :param columns: (List[str]) Names of the observation columns.  Default is None, which is passed to
                        the scala model as an empty option
        :return: (ClassificationMetricsValue) Wrapper around the scala test result
        """
        c = self.__columns_to_option(columns)
        return ClassificationMetricsValue(self._tc, self._scala.test(frame._scala, c))

    def __columns_to_option(self, c):
        """Convert an optional python list of column names into a scala option of a scala string list"""
        if c is not None:
            c = self._tc.jutils.convert.to_scala_list_string(c)
        return self._tc.jutils.convert.to_scala_option(c)

    def save(self, path):
        """save the trained model to path"""
        self._scala.save(self._tc._scala_sc, path)

    def export_to_mar(self, path):
        """
        Exports the trained model as a model archive (.mar) to the specified path

        Parameters
        ----------

        :param path: (str) Path to save the trained model
        :return: (str) Full path to the saved .mar file

        Raises TypeError when path is not a string (previously this case silently returned None).
        """
        try:
            string_types = basestring
        except NameError:
            # interpreters without `basestring` only have `str`
            string_types = str
        if not isinstance(path, string_types):
            raise TypeError("path parameter must be a str, but received %s" % type(path))
        return self._scala.exportToMar(self._tc._scala_sc, path)
# Drop the imported base-class name from this module's namespace; SvmModel keeps its own reference
del PropertiesObject
Functions
def load(
path, tc=<class 'sparktk.arguments.implicit'>)
load SvmModel from given path
def load(path, tc=TkContext.implicit):
    """
    load SvmModel from given path

    Parameters
    ----------

    :param path: (str) Location the model is loaded from
    :param tc: (TkContext) Context used for loading; checked with TkContext.validate before use
    :return: Whatever tc.load returns for this path when asked for a SvmModel
    """
    TkContext.validate(tc)
    restored_model = tc.load(path, SvmModel)
    return restored_model
def train(
frame, label_column, observation_columns, intercept=True, num_iterations=100, step_size=1.0, reg_type=None, reg_param=0.01, mini_batch_fraction=1.0)
Creates a Svm Model by training on the given frame
frame | (Frame): | frame of training data |
label_column | (str): | Column containing the label for each observation |
observation_columns | (list(str)): | Column(s) containing the observations |
intercept | (boolean): | Flag indicating if the algorithm adds an intercept. Default is true |
num_iterations | (int): | Number of iterations for SGD. Default is 100 |
step_size | (float): | Initial step size for SGD optimizer for the first step. Default is 1.0 |
reg_type | (Optional(str)): | Regularization "L1" or "L2". Default is "L2" |
reg_param | (float): | Regularization parameter. Default is 0.01 |
mini_batch_fraction | (float): | Set fraction of data to be used for each SGD iteration. Default is 1.0; corresponding to deterministic/classical gradient descent |
Returns | (SvmModel): | The SVM trained model (with SGD) |
Support Vector Machine is a supervised algorithm used to perform binary classification. A Support Vector Machine constructs a high dimensional hyperplane which is said to achieve a good separation when a hyperplane has the largest distance to the nearest training-data point of any class. This model runs the MLLib implementation of SVM with SGD optimizer. The SVM model is initialized, trained on columns of a frame, used to predict the labels of observations in a frame, and tests the predicted labels against the true labels. During testing, labels of the observations are predicted and tested against the true labels using built-in binary Classification Metrics.
def train(frame,
          label_column,
          observation_columns,
          intercept=True,
          num_iterations=100,
          step_size=1.0,
          reg_type=None,
          reg_param=0.01,
          mini_batch_fraction=1.0):
    """
    Train an SVM model (SGD optimizer) on the given frame and wrap the result in a SvmModel

    Parameters
    ----------

    :param frame: (Frame) frame of training data
    :param label_column: (str) Column containing the label for each observation
    :param observation_columns: (list(str)) Column(s) containing the observations
    :param intercept: (boolean) Flag indicating if the algorithm adds an intercept.  Default is true
    :param num_iterations: (int) Number of iterations for SGD.  Default is 100
    :param step_size: (float) Initial step size for SGD optimizer for the first step.  Default is 1.0
    :param reg_type: (Optional(str)) Regularization "L1" or "L2".  The python-side default is None, which is
                     handed to the scala trainer as an option; the effective default is documented as "L2"
    :param reg_param: (float) Regularization parameter.  Default is 0.01
    :param mini_batch_fraction: (float) Set fraction of data to be used for each SGD iteration.  Default is 1.0;
                                corresponding to deterministic/classical gradient descent
    :return: (SvmModel) The SVM trained model (with SGD)

    Raises ValueError when frame is None.

    Notes
    -----
    Support Vector Machine is a supervised algorithm used to perform binary classification.  A Support Vector
    Machine constructs a high dimensional hyperplane which is said to achieve a good separation when a hyperplane
    has the largest distance to the nearest training-data point of any class.  This model runs the MLLib
    implementation of SVM with SGD optimizer.  The SVM model is initialized, trained on columns of a frame, used
    to predict the labels of observations in a frame, and tests the predicted labels against the true labels.
    During testing, labels of the observations are predicted and tested against the true labels using built-in
    binary Classification Metrics.
    """
    if frame is None:
        raise ValueError("frame cannot be None")

    tc = frame._tc
    # Bound `train` method of the scala-side SvmModel object
    scala_train = get_scala_obj(tc).train
    # Positional arguments in the order the scala trainer is called with;
    # python lists / optionals are converted to their scala counterparts
    scala_args = (
        frame._scala,
        label_column,
        tc.jutils.convert.to_scala_list_string(observation_columns),
        intercept,
        num_iterations,
        step_size,
        tc.jutils.convert.to_scala_option(reg_type),
        reg_param,
        mini_batch_fraction,
    )
    return SvmModel(tc, scala_train(*scala_args))
Classes
class SvmModel
A trained Svm model
>>> frame = tc.frame.create([[-48.0,1], [-75.0,1], [-63.0,1], [-57.0,1], [73.0,0], [-33.0,1], [100.0,0],
... [-54.0,1], [78.0,0], [48.0,0], [-55.0,1], [23.0,0], [45.0,0], [75.0,0]],
... [("data", float),("label", str)])
>>> frame.inspect()
[#] data label
=================
[0] -48.0 1
[1] -75.0 1
[2] -63.0 1
[3] -57.0 1
[4] 73.0 0
[5] -33.0 1
[6] 100.0 0
[7] -54.0 1
[8] 78.0 0
[9] 48.0 0
>>> model = tc.models.classification.svm.train(frame, 'label', ['data'])
>>> model.label_column
u'label'
>>> model.observation_columns
[u'data']
>>> predicted_frame = model.predict(frame, ['data'])
>>> predicted_frame.inspect()
[#] data label predicted_label
==================================
[0] -48.0 1 1
[1] -75.0 1 1
[2] -63.0 1 1
[3] -57.0 1 1
[4] 73.0 0 0
[5] -33.0 1 1
[6] 100.0 0 0
[7] -54.0 1 1
[8] 78.0 0 0
[9] 48.0 0 0
>>> test_metrics = model.test(predicted_frame)
>>> test_metrics
accuracy = 1.0
confusion_matrix = Predicted_Pos Predicted_Neg
Actual_Pos 7 0
Actual_Neg 0 7
f_measure = 1.0
precision = 1.0
recall = 1.0
>>> model.save("sandbox/svm")
>>> restored = tc.load("sandbox/svm")
>>> restored.label_column == model.label_column
True
>>> restored.intercept == model.intercept
True
>>> set(restored.observation_columns) == set(model.observation_columns)
True
>>> predicted_frame2 = restored.predict(frame)
>>> predicted_frame2.inspect()
[#] data label predicted_label
==================================
[0] -48.0 1 1
[1] -75.0 1 1
[2] -63.0 1 1
[3] -57.0 1 1
[4] 73.0 0 0
[5] -33.0 1 1
[6] 100.0 0 0
[7] -54.0 1 1
[8] 78.0 0 0
[9] 48.0 0 0
>>> canonical_path = model.export_to_mar("sandbox/SVM.mar")
class SvmModel(PropertiesObject):
    """
    A trained Svm model

    Example
    -------

        >>> frame = tc.frame.create([[-48.0,1], [-75.0,1], [-63.0,1], [-57.0,1], [73.0,0], [-33.0,1], [100.0,0],
        ...                          [-54.0,1], [78.0,0], [48.0,0], [-55.0,1], [23.0,0], [45.0,0], [75.0,0]],
        ...                          [("data", float),("label", str)])

        >>> frame.inspect()
        [#]  data   label
        =================
        [0]  -48.0  1
        [1]  -75.0  1
        [2]  -63.0  1
        [3]  -57.0  1
        [4]   73.0  0
        [5]  -33.0  1
        [6]  100.0  0
        [7]  -54.0  1
        [8]   78.0  0
        [9]   48.0  0

        >>> model = tc.models.classification.svm.train(frame, 'label', ['data'])

        >>> model.label_column
        u'label'

        >>> model.observation_columns
        [u'data']

        >>> predicted_frame = model.predict(frame, ['data'])

        >>> predicted_frame.inspect()
        [#]  data   label  predicted_label
        ==================================
        [0]  -48.0  1                    1
        [1]  -75.0  1                    1
        [2]  -63.0  1                    1
        [3]  -57.0  1                    1
        [4]   73.0  0                    0
        [5]  -33.0  1                    1
        [6]  100.0  0                    0
        [7]  -54.0  1                    1
        [8]   78.0  0                    0
        [9]   48.0  0                    0

        >>> test_metrics = model.test(predicted_frame)

        >>> test_metrics
        accuracy         = 1.0
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos                  7              0
        Actual_Neg                  0              7
        f_measure        = 1.0
        precision        = 1.0
        recall           = 1.0

        >>> model.save("sandbox/svm")

        >>> restored = tc.load("sandbox/svm")

        >>> restored.label_column == model.label_column
        True

        >>> restored.intercept == model.intercept
        True

        >>> set(restored.observation_columns) == set(model.observation_columns)
        True

        >>> predicted_frame2 = restored.predict(frame)

        >>> predicted_frame2.inspect()
        [#]  data   label  predicted_label
        ==================================
        [0]  -48.0  1                    1
        [1]  -75.0  1                    1
        [2]  -63.0  1                    1
        [3]  -57.0  1                    1
        [4]   73.0  0                    0
        [5]  -33.0  1                    1
        [6]  100.0  0                    0
        [7]  -54.0  1                    1
        [8]   78.0  0                    0
        [9]   48.0  0                    0

        >>> canonical_path = model.export_to_mar("sandbox/SVM.mar")

    """

    def __init__(self, tc, scala_model):
        """
        Wrap a scala SvmModel

        :param tc: (TkContext) context the model belongs to
        :param scala_model: scala-side model; checked against the class returned by get_scala_obj(tc)
        """
        self._tc = tc
        tc.jutils.validate_is_jvm_instance_of(scala_model, get_scala_obj(tc))
        self._scala = scala_model

    @staticmethod
    def _from_scala(tc, scala_model):
        """Build a python SvmModel around an existing scala model"""
        return SvmModel(tc, scala_model)

    @property
    def label_column(self):
        """column containing the label used during model training"""
        return self._scala.labelColumn()

    @property
    def observation_columns(self):
        """columns containing the observation values used during model training"""
        return self._tc.jutils.convert.from_scala_seq(self._scala.observationColumns())

    @property
    def intercept(self):
        """intercept used during model training"""
        return self._scala.intercept()

    @property
    def num_iterations(self):
        """max number of iterations allowed during model training"""
        return self._scala.numIterations()

    @property
    def step_size(self):
        """step size value used to train the model"""
        return self._scala.stepSize()

    @property
    def reg_type(self):
        """regularization type used to train the model"""
        return self._tc.jutils.convert.from_scala_option(self._scala.regType())

    @property
    def reg_param(self):
        """regularization parameter used to train the model"""
        return self._scala.regParam()

    @property
    def mini_batch_fraction(self):
        """mini-batch fraction used to train the model"""
        return self._scala.miniBatchFraction()

    def predict(self, frame, columns=None):
        """
        Predicts the labels for the observation columns in the given input frame. Creates a new frame
        with the existing columns and a new predicted column.

        Parameters
        ----------

        :param frame: (Frame) Frame used for predicting the values
        :param columns: (List[str]) Names of the observation columns.  Default is None, which is passed to
                        the scala model as an empty option
        :return: (Frame) A new frame containing the original frame's columns and a prediction column
        """
        c = self.__columns_to_option(columns)
        # Imported here rather than at module level (presumably to avoid a circular import -- confirm)
        from sparktk.frame.frame import Frame
        return Frame(self._tc, self._scala.predict(frame._scala, c))

    def test(self, frame, columns=None):
        """
        test the frame given the trained model

        Parameters
        ----------

        :param frame: (Frame) Frame to run the scala model's test on
        :param columns: (List[str]) Names of the observation columns.  Default is None, which is passed to
                        the scala model as an empty option
        :return: (ClassificationMetricsValue) Wrapper around the scala test result
        """
        c = self.__columns_to_option(columns)
        return ClassificationMetricsValue(self._tc, self._scala.test(frame._scala, c))

    def __columns_to_option(self, c):
        """Convert an optional python list of column names into a scala option of a scala string list"""
        if c is not None:
            c = self._tc.jutils.convert.to_scala_list_string(c)
        return self._tc.jutils.convert.to_scala_option(c)

    def save(self, path):
        """save the trained model to path"""
        self._scala.save(self._tc._scala_sc, path)

    def export_to_mar(self, path):
        """
        Exports the trained model as a model archive (.mar) to the specified path

        Parameters
        ----------

        :param path: (str) Path to save the trained model
        :return: (str) Full path to the saved .mar file

        Raises TypeError when path is not a string (previously this case silently returned None).
        """
        try:
            string_types = basestring
        except NameError:
            # interpreters without `basestring` only have `str`
            string_types = str
        if not isinstance(path, string_types):
            raise TypeError("path parameter must be a str, but received %s" % type(path))
        return self._scala.exportToMar(self._tc._scala_sc, path)
Ancestors (in MRO)
- SvmModel
- sparktk.propobj.PropertiesObject
- __builtin__.object
Instance variables
var intercept
intercept used during model training
var label_column
column containing the label used during model training
var mini_batch_fraction
mini-batch fraction used to train the model
var num_iterations
max number of iterations allowed during model training
var observation_columns
columns containing the observation values used during model training
var reg_param
regularization parameter used to train the model
var reg_type
regularization type used to train the model
var step_size
step size value used to train the model
Methods
def __init__(
self, tc, scala_model)
def __init__(self, tc, scala_model):
    """
    Wrap a scala SvmModel

    :param tc: (TkContext) context the model belongs to
    :param scala_model: scala-side model; checked against the class returned by get_scala_obj(tc)
    """
    self._tc = tc
    # The context is stored first, then the scala model is checked, and only then kept
    check_instance = tc.jutils.validate_is_jvm_instance_of
    expected_class = get_scala_obj(tc)
    check_instance(scala_model, expected_class)
    self._scala = scala_model
def export_to_mar(
self, path)
Exports the trained model as a model archive (.mar) to the specified path
path | (str): | Path to save the trained model |
Returns | (str): | Full path to the saved .mar file |
def export_to_mar(self, path):
    """
    Exports the trained model as a model archive (.mar) to the specified path

    Parameters
    ----------

    :param path: (str) Path to save the trained model
    :return: (str) Full path to the saved .mar file

    Raises TypeError when path is not a string (previously this case silently returned None).
    """
    try:
        string_types = basestring
    except NameError:
        # interpreters without `basestring` only have `str`
        string_types = str
    if not isinstance(path, string_types):
        raise TypeError("path parameter must be a str, but received %s" % type(path))
    return self._scala.exportToMar(self._tc._scala_sc, path)
def predict(
self, frame, columns=None)
Predicts the labels for the observation columns in the given input frame. Creates a new frame with the existing columns and a new predicted column.
frame | (Frame): | Frame used for predicting the values |
columns | (List[str]): | Names of the observation columns. |
Returns | (Frame): | A new frame containing the original frame's columns and a prediction column |
def predict(self, frame, columns=None):
    """
    Predicts the labels for the observation columns in the given input frame. Creates a new frame
    with the existing columns and a new predicted column.

    Parameters
    ----------

    :param frame: (Frame) Frame used for predicting the values
    :param columns: (List[str]) Names of the observation columns.  Default is None
    :return: (Frame) A new frame containing the original frame's columns and a prediction column
    """
    scala_columns = self.__columns_to_option(columns)
    # Imported here rather than at module level (presumably to avoid a circular import -- confirm)
    from sparktk.frame.frame import Frame
    scala_predicted = self._scala.predict(frame._scala, scala_columns)
    return Frame(self._tc, scala_predicted)
def save(
self, path)
save the trained model to path
def save(self, path):
    """
    save the trained model to path

    :param path: (str) Destination handed to the scala model's save, together with the context's scala SparkContext
    """
    scala_save = self._scala.save
    scala_save(self._tc._scala_sc, path)
def test(
self, frame, columns=None)
test the frame given the trained model
def test(self, frame, columns=None):
    """
    test the frame given the trained model

    Parameters
    ----------

    :param frame: (Frame) Frame to run the scala model's test on
    :param columns: (List[str]) Names of the observation columns.  Default is None
    :return: (ClassificationMetricsValue) Wrapper around the scala test result
    """
    scala_columns = self.__columns_to_option(columns)
    scala_metrics = self._scala.test(frame._scala, scala_columns)
    return ClassificationMetricsValue(self._tc, scala_metrics)
def to_dict(
self)
def to_dict(self):
    """
    Return the dict from self._properties() updated in place with the entries of self._attributes()

    Entries from _attributes() win when both provide the same key.
    """
    combined = self._properties()
    extra = self._attributes()
    combined.update(extra)
    return combined
def to_json(
self)
def to_json(self):
    """Serialize the result of self.to_dict() to a JSON string with json.dumps"""
    as_dict = self.to_dict()
    return json.dumps(as_dict)