sparktk.models.dimreduction.pca module
# vim: set encoding=utf-8
# Copyright (c) 2016 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from sparktk.loggers import log_load; log_load(__name__); del log_load
from sparktk.propobj import PropertiesObject
from sparktk import TkContext
__all__ = ["train", "load", "PcaModel"]
def train(frame, columns, mean_centered=True, k=None):
"""
Creates a PcaModel by training on the given frame
Parameters
----------
:param frame: (Frame) A frame of training data.
:param columns: (str or list[str]) Names of columns containing the observations for training.
:param mean_centered: (bool) Whether to mean center the columns.
:param k: (int) Principal component count. Default is the number of observation columns.
:return: (PcaModel) The trained PCA model
"""
if frame is None:
raise ValueError("frame cannot be None")
tc = frame._tc
_scala_obj = get_scala_obj(tc)
scala_columns = tc.jutils.convert.to_scala_vector_string(columns)
if not isinstance(mean_centered, bool):
raise ValueError("mean_centered must be a bool, received %s" % type(mean_centered))
scala_k = tc.jutils.convert.to_scala_option(k)
scala_model = _scala_obj.train(frame._scala, scala_columns, mean_centered, scala_k)
return PcaModel(tc, scala_model)
def load(path, tc=TkContext.implicit):
"""load PcaModel from given path"""
TkContext.validate(tc)
return tc.load(path, PcaModel)
def get_scala_obj(tc):
"""Gets reference to the scala object"""
return tc.sc._jvm.org.trustedanalytics.sparktk.models.dimreduction.pca.PcaModel
class PcaModel(PropertiesObject):
"""
Princiapl Component Analysis Model
Example
-------
>>> frame = tc.frame.create([[2.6,1.7,0.3,1.5,0.8,0.7],
... [3.3,1.8,0.4,0.7,0.9,0.8],
... [3.5,1.7,0.3,1.7,0.6,0.4],
... [3.7,1.0,0.5,1.2,0.6,0.3],
... [1.5,1.2,0.5,1.4,0.6,0.4]],
... [("1", float), ("2", float), ("3", float), ("4", float), ("5", float), ("6", float)])
-etc-
>>> frame.inspect()
[#] 1 2 3 4 5 6
=================================
[0] 2.6 1.7 0.3 1.5 0.8 0.7
[1] 3.3 1.8 0.4 0.7 0.9 0.8
[2] 3.5 1.7 0.3 1.7 0.6 0.4
[3] 3.7 1.0 0.5 1.2 0.6 0.3
[4] 1.5 1.2 0.5 1.4 0.6 0.4
>>> model = tc.models.dimreduction.pca.train(frame, ['1','2','3','4','5','6'], mean_centered=True, k=4)
>>> model.columns
[u'1', u'2', u'3', u'4', u'5', u'6']
>>> model.column_means
[2.92, 1.48, 0.4, 1.3, 0.7, 0.52]
>>> model.singular_values
[1.804817009663242, 0.8835344148403884, 0.7367461843294286, 0.15234027471064396]
>>> model.right_singular_vectors
[[-0.9906468642089336, 0.11801374544146298, 0.02564701035332026, 0.04852509627553534], [-0.07735139793384983, -0.6023104604841426, 0.6064054412059492, -0.4961696216881456], [0.028850639537397756, 0.07268697636708586, -0.24463936400591005, -0.17103491337994484], [0.10576208410025367, 0.5480329468552814, 0.7523059089872701, 0.2866144016081254], [-0.024072151446194616, -0.30472267167437644, -0.011259366445851784, 0.48934541040601887], [-0.00617295395184184, -0.47414707747028795, 0.0753345822621543, 0.6329307498105843]]
>>> predicted_frame = model.predict(frame, mean_centered=True, t_squared_index=True, columns=['1','2','3','4','5','6'], k=3)
-etc-
>>> predicted_frame.inspect()
[#] 1 2 3 4 5 6 p_1 p_2
===================================================================
[0] 1.5 1.2 0.5 1.4 0.6 0.4 1.44498618058 0.150509319195
[1] 2.6 1.7 0.3 1.5 0.8 0.7 0.314738695012 -0.183753549226
[2] 3.5 1.7 0.3 1.7 0.6 0.4 -0.549024749481 0.235254068619
[3] 3.3 1.8 0.4 0.7 0.9 0.8 -0.471198363594 -0.670419608227
[4] 3.7 1.0 0.5 1.2 0.6 0.3 -0.739501762517 0.468409769639
[#] p_3 t_squared_index
=====================================
[0] -0.163359836968 0.719188122813
[1] 0.312561560113 0.253649649849
[2] 0.465756549839 0.563086507007
[3] -0.228746130528 0.740327252782
[4] -0.386212142456 0.723748467549
>>> model.save('sandbox/pca1')
>>> model2 = tc.load('sandbox/pca1')
>>> model2.k
4
>>> predicted_frame2 = model2.predict(frame, mean_centered=True, t_squared_index=True, columns=['1','2','3','4','5','6'], k=3)
>>> predicted_frame2.inspect()
[#] 1 2 3 4 5 6 p_1 p_2
===================================================================
[0] 1.5 1.2 0.5 1.4 0.6 0.4 1.44498618058 0.150509319195
[1] 2.6 1.7 0.3 1.5 0.8 0.7 0.314738695012 -0.183753549226
[2] 3.5 1.7 0.3 1.7 0.6 0.4 -0.549024749481 0.235254068619
[3] 3.3 1.8 0.4 0.7 0.9 0.8 -0.471198363594 -0.670419608227
[4] 3.7 1.0 0.5 1.2 0.6 0.3 -0.739501762517 0.468409769639
[#] p_3 t_squared_index
=====================================
[0] -0.163359836968 0.719188122813
[1] 0.312561560113 0.253649649849
[2] 0.465756549839 0.563086507007
[3] -0.228746130528 0.740327252782
[4] -0.386212142456 0.723748467549
>>> canonical_path = model.export_to_mar("sandbox/Kmeans.mar")
"""
def __init__(self, tc, scala_model):
self._tc = tc
tc.jutils.validate_is_jvm_instance_of(scala_model, get_scala_obj(tc))
self._scala = scala_model
@staticmethod
def _from_scala(tc, scala_model):
return PcaModel(tc, scala_model)
@property
def columns(self):
return list(self._tc.jutils.convert.from_scala_seq(self._scala.columns()))
@property
def mean_centered(self):
return self._scala.meanCentered()
@property
def k(self):
return self._scala.k()
@property
def column_means(self):
return list(self._scala.columnMeansAsArray())
@property
def singular_values(self):
return list(self._scala.singularValuesAsArray())
@property
def right_singular_vectors(self):
x = list(self._scala.rightSingularVectorsAsArray())
return [x[i:i+self.k] for i in xrange(0,len(x),self.k)]
def predict(self, frame, columns=None, mean_centered=None, k=None, t_squared_index=False):
"""
Predicts the labels for the observation columns in the given input frame. Creates a new frame
with the existing columns and a new predicted column.
Parameters
----------
:param frame: (Frame) Frame used for predicting the values
:param columns: (List[str]) Names of the observation columns.
:param mean_centered: (boolean) whether to mean center the columns. Default is true
:param k: (int) the number of principal components to be computed, must be <= the k used in training. Default is the trained k
:param t_squared_index: (boolean) whether the t-square index is to be computed. Default is false
:return: (Frame) A new frame containing the original frame's columns and a prediction column
"""
if mean_centered is None:
mean_centered = self.mean_centered
from sparktk.frame.frame import Frame
return Frame(self._tc, self._scala.predict(frame._scala,
self._tc.jutils.convert.to_scala_option_list_string(columns),
mean_centered,
self._tc.jutils.convert.to_scala_option(k),
t_squared_index))
def save(self, path):
self._scala.save(self._tc._scala_sc, path)
def export_to_mar(self, path):
"""
Exports the trained model as a model archive (.mar) to the specified path
Parameters
----------
:param path: (str) Path to save the trained model
:return: (str) Full path to the saved .mar file
"""
if isinstance(path, basestring):
return self._scala.exportToMar(self._tc._scala_sc, path)
Functions
def load(
path, tc=<class 'sparktk.arguments.implicit'>)
load PcaModel from given path
def load(path, tc=TkContext.implicit):
"""load PcaModel from given path"""
TkContext.validate(tc)
return tc.load(path, PcaModel)
def train(
frame, columns, mean_centered=True, k=None)
Creates a PcaModel by training on the given frame
frame | (Frame): | A frame of training data. |
columns | (str or list[str]): | Names of columns containing the observations for training. |
mean_centered | (bool): | Whether to mean center the columns. |
k | (int): | Principal component count. Default is the number of observation columns. |
Returns | (PcaModel): | The trained PCA model |
def train(frame, columns, mean_centered=True, k=None):
"""
Creates a PcaModel by training on the given frame
Parameters
----------
:param frame: (Frame) A frame of training data.
:param columns: (str or list[str]) Names of columns containing the observations for training.
:param mean_centered: (bool) Whether to mean center the columns.
:param k: (int) Principal component count. Default is the number of observation columns.
:return: (PcaModel) The trained PCA model
"""
if frame is None:
raise ValueError("frame cannot be None")
tc = frame._tc
_scala_obj = get_scala_obj(tc)
scala_columns = tc.jutils.convert.to_scala_vector_string(columns)
if not isinstance(mean_centered, bool):
raise ValueError("mean_centered must be a bool, received %s" % type(mean_centered))
scala_k = tc.jutils.convert.to_scala_option(k)
scala_model = _scala_obj.train(frame._scala, scala_columns, mean_centered, scala_k)
return PcaModel(tc, scala_model)
Classes
class PcaModel
Princiapl Component Analysis Model
>>> frame = tc.frame.create([[2.6,1.7,0.3,1.5,0.8,0.7],
... [3.3,1.8,0.4,0.7,0.9,0.8],
... [3.5,1.7,0.3,1.7,0.6,0.4],
... [3.7,1.0,0.5,1.2,0.6,0.3],
... [1.5,1.2,0.5,1.4,0.6,0.4]],
... [("1", float), ("2", float), ("3", float), ("4", float), ("5", float), ("6", float)])
-etc-
>>> frame.inspect()
[#] 1 2 3 4 5 6
=================================
[0] 2.6 1.7 0.3 1.5 0.8 0.7
[1] 3.3 1.8 0.4 0.7 0.9 0.8
[2] 3.5 1.7 0.3 1.7 0.6 0.4
[3] 3.7 1.0 0.5 1.2 0.6 0.3
[4] 1.5 1.2 0.5 1.4 0.6 0.4
>>> model = tc.models.dimreduction.pca.train(frame, ['1','2','3','4','5','6'], mean_centered=True, k=4)
>>> model.columns
[u'1', u'2', u'3', u'4', u'5', u'6']
>>> model.column_means
[2.92, 1.48, 0.4, 1.3, 0.7, 0.52]
>>> model.singular_values
[1.804817009663242, 0.8835344148403884, 0.7367461843294286, 0.15234027471064396]
>>> model.right_singular_vectors
[[-0.9906468642089336, 0.11801374544146298, 0.02564701035332026, 0.04852509627553534], [-0.07735139793384983, -0.6023104604841426, 0.6064054412059492, -0.4961696216881456], [0.028850639537397756, 0.07268697636708586, -0.24463936400591005, -0.17103491337994484], [0.10576208410025367, 0.5480329468552814, 0.7523059089872701, 0.2866144016081254], [-0.024072151446194616, -0.30472267167437644, -0.011259366445851784, 0.48934541040601887], [-0.00617295395184184, -0.47414707747028795, 0.0753345822621543, 0.6329307498105843]]
>>> predicted_frame = model.predict(frame, mean_centered=True, t_squared_index=True, columns=['1','2','3','4','5','6'], k=3)
-etc-
>>> predicted_frame.inspect()
[#] 1 2 3 4 5 6 p_1 p_2
===================================================================
[0] 1.5 1.2 0.5 1.4 0.6 0.4 1.44498618058 0.150509319195
[1] 2.6 1.7 0.3 1.5 0.8 0.7 0.314738695012 -0.183753549226
[2] 3.5 1.7 0.3 1.7 0.6 0.4 -0.549024749481 0.235254068619
[3] 3.3 1.8 0.4 0.7 0.9 0.8 -0.471198363594 -0.670419608227
[4] 3.7 1.0 0.5 1.2 0.6 0.3 -0.739501762517 0.468409769639
<BLANKLINE>
[#] p_3 t_squared_index
=====================================
[0] -0.163359836968 0.719188122813
[1] 0.312561560113 0.253649649849
[2] 0.465756549839 0.563086507007
[3] -0.228746130528 0.740327252782
[4] -0.386212142456 0.723748467549
>>> model.save('sandbox/pca1')
>>> model2 = tc.load('sandbox/pca1')
>>> model2.k
4
>>> predicted_frame2 = model2.predict(frame, mean_centered=True, t_squared_index=True, columns=['1','2','3','4','5','6'], k=3)
>>> predicted_frame2.inspect()
[#] 1 2 3 4 5 6 p_1 p_2
===================================================================
[0] 1.5 1.2 0.5 1.4 0.6 0.4 1.44498618058 0.150509319195
[1] 2.6 1.7 0.3 1.5 0.8 0.7 0.314738695012 -0.183753549226
[2] 3.5 1.7 0.3 1.7 0.6 0.4 -0.549024749481 0.235254068619
[3] 3.3 1.8 0.4 0.7 0.9 0.8 -0.471198363594 -0.670419608227
[4] 3.7 1.0 0.5 1.2 0.6 0.3 -0.739501762517 0.468409769639
<BLANKLINE>
[#] p_3 t_squared_index
=====================================
[0] -0.163359836968 0.719188122813
[1] 0.312561560113 0.253649649849
[2] 0.465756549839 0.563086507007
[3] -0.228746130528 0.740327252782
[4] -0.386212142456 0.723748467549
>>> canonical_path = model.export_to_mar("sandbox/Kmeans.mar")
class PcaModel(PropertiesObject):
"""
Princiapl Component Analysis Model
Example
-------
>>> frame = tc.frame.create([[2.6,1.7,0.3,1.5,0.8,0.7],
... [3.3,1.8,0.4,0.7,0.9,0.8],
... [3.5,1.7,0.3,1.7,0.6,0.4],
... [3.7,1.0,0.5,1.2,0.6,0.3],
... [1.5,1.2,0.5,1.4,0.6,0.4]],
... [("1", float), ("2", float), ("3", float), ("4", float), ("5", float), ("6", float)])
-etc-
>>> frame.inspect()
[#] 1 2 3 4 5 6
=================================
[0] 2.6 1.7 0.3 1.5 0.8 0.7
[1] 3.3 1.8 0.4 0.7 0.9 0.8
[2] 3.5 1.7 0.3 1.7 0.6 0.4
[3] 3.7 1.0 0.5 1.2 0.6 0.3
[4] 1.5 1.2 0.5 1.4 0.6 0.4
>>> model = tc.models.dimreduction.pca.train(frame, ['1','2','3','4','5','6'], mean_centered=True, k=4)
>>> model.columns
[u'1', u'2', u'3', u'4', u'5', u'6']
>>> model.column_means
[2.92, 1.48, 0.4, 1.3, 0.7, 0.52]
>>> model.singular_values
[1.804817009663242, 0.8835344148403884, 0.7367461843294286, 0.15234027471064396]
>>> model.right_singular_vectors
[[-0.9906468642089336, 0.11801374544146298, 0.02564701035332026, 0.04852509627553534], [-0.07735139793384983, -0.6023104604841426, 0.6064054412059492, -0.4961696216881456], [0.028850639537397756, 0.07268697636708586, -0.24463936400591005, -0.17103491337994484], [0.10576208410025367, 0.5480329468552814, 0.7523059089872701, 0.2866144016081254], [-0.024072151446194616, -0.30472267167437644, -0.011259366445851784, 0.48934541040601887], [-0.00617295395184184, -0.47414707747028795, 0.0753345822621543, 0.6329307498105843]]
>>> predicted_frame = model.predict(frame, mean_centered=True, t_squared_index=True, columns=['1','2','3','4','5','6'], k=3)
-etc-
>>> predicted_frame.inspect()
[#] 1 2 3 4 5 6 p_1 p_2
===================================================================
[0] 1.5 1.2 0.5 1.4 0.6 0.4 1.44498618058 0.150509319195
[1] 2.6 1.7 0.3 1.5 0.8 0.7 0.314738695012 -0.183753549226
[2] 3.5 1.7 0.3 1.7 0.6 0.4 -0.549024749481 0.235254068619
[3] 3.3 1.8 0.4 0.7 0.9 0.8 -0.471198363594 -0.670419608227
[4] 3.7 1.0 0.5 1.2 0.6 0.3 -0.739501762517 0.468409769639
[#] p_3 t_squared_index
=====================================
[0] -0.163359836968 0.719188122813
[1] 0.312561560113 0.253649649849
[2] 0.465756549839 0.563086507007
[3] -0.228746130528 0.740327252782
[4] -0.386212142456 0.723748467549
>>> model.save('sandbox/pca1')
>>> model2 = tc.load('sandbox/pca1')
>>> model2.k
4
>>> predicted_frame2 = model2.predict(frame, mean_centered=True, t_squared_index=True, columns=['1','2','3','4','5','6'], k=3)
>>> predicted_frame2.inspect()
[#] 1 2 3 4 5 6 p_1 p_2
===================================================================
[0] 1.5 1.2 0.5 1.4 0.6 0.4 1.44498618058 0.150509319195
[1] 2.6 1.7 0.3 1.5 0.8 0.7 0.314738695012 -0.183753549226
[2] 3.5 1.7 0.3 1.7 0.6 0.4 -0.549024749481 0.235254068619
[3] 3.3 1.8 0.4 0.7 0.9 0.8 -0.471198363594 -0.670419608227
[4] 3.7 1.0 0.5 1.2 0.6 0.3 -0.739501762517 0.468409769639
[#] p_3 t_squared_index
=====================================
[0] -0.163359836968 0.719188122813
[1] 0.312561560113 0.253649649849
[2] 0.465756549839 0.563086507007
[3] -0.228746130528 0.740327252782
[4] -0.386212142456 0.723748467549
>>> canonical_path = model.export_to_mar("sandbox/Kmeans.mar")
"""
def __init__(self, tc, scala_model):
self._tc = tc
tc.jutils.validate_is_jvm_instance_of(scala_model, get_scala_obj(tc))
self._scala = scala_model
@staticmethod
def _from_scala(tc, scala_model):
return PcaModel(tc, scala_model)
@property
def columns(self):
return list(self._tc.jutils.convert.from_scala_seq(self._scala.columns()))
@property
def mean_centered(self):
return self._scala.meanCentered()
@property
def k(self):
return self._scala.k()
@property
def column_means(self):
return list(self._scala.columnMeansAsArray())
@property
def singular_values(self):
return list(self._scala.singularValuesAsArray())
@property
def right_singular_vectors(self):
x = list(self._scala.rightSingularVectorsAsArray())
return [x[i:i+self.k] for i in xrange(0,len(x),self.k)]
def predict(self, frame, columns=None, mean_centered=None, k=None, t_squared_index=False):
"""
Predicts the labels for the observation columns in the given input frame. Creates a new frame
with the existing columns and a new predicted column.
Parameters
----------
:param frame: (Frame) Frame used for predicting the values
:param columns: (List[str]) Names of the observation columns.
:param mean_centered: (boolean) whether to mean center the columns. Default is true
:param k: (int) the number of principal components to be computed, must be <= the k used in training. Default is the trained k
:param t_squared_index: (boolean) whether the t-square index is to be computed. Default is false
:return: (Frame) A new frame containing the original frame's columns and a prediction column
"""
if mean_centered is None:
mean_centered = self.mean_centered
from sparktk.frame.frame import Frame
return Frame(self._tc, self._scala.predict(frame._scala,
self._tc.jutils.convert.to_scala_option_list_string(columns),
mean_centered,
self._tc.jutils.convert.to_scala_option(k),
t_squared_index))
def save(self, path):
self._scala.save(self._tc._scala_sc, path)
def export_to_mar(self, path):
"""
Exports the trained model as a model archive (.mar) to the specified path
Parameters
----------
:param path: (str) Path to save the trained model
:return: (str) Full path to the saved .mar file
"""
if isinstance(path, basestring):
return self._scala.exportToMar(self._tc._scala_sc, path)
Ancestors (in MRO)
- PcaModel
- sparktk.propobj.PropertiesObject
- __builtin__.object
Instance variables
var column_means
var columns
var k
var mean_centered
var right_singular_vectors
var singular_values
Methods
def __init__(
self, tc, scala_model)
def __init__(self, tc, scala_model):
self._tc = tc
tc.jutils.validate_is_jvm_instance_of(scala_model, get_scala_obj(tc))
self._scala = scala_model
def export_to_mar(
self, path)
Exports the trained model as a model archive (.mar) to the specified path
path | (str): | Path to save the trained model |
Returns | (str): | Full path to the saved .mar file |
def export_to_mar(self, path):
"""
Exports the trained model as a model archive (.mar) to the specified path
Parameters
----------
:param path: (str) Path to save the trained model
:return: (str) Full path to the saved .mar file
"""
if isinstance(path, basestring):
return self._scala.exportToMar(self._tc._scala_sc, path)
def predict(
self, frame, columns=None, mean_centered=None, k=None, t_squared_index=False)
Predicts the labels for the observation columns in the given input frame. Creates a new frame with the existing columns and a new predicted column.
frame | (Frame): | Frame used for predicting the values |
columns | (List[str]): | Names of the observation columns. |
mean_centered | (boolean): | whether to mean center the columns. Default is true |
k | (int): | the number of principal components to be computed, must be <= the k used in training. Default is the trained k |
t_squared_index | (boolean): | whether the t-square index is to be computed. Default is false |
Returns | (Frame): | A new frame containing the original frame's columns and a prediction column |
def predict(self, frame, columns=None, mean_centered=None, k=None, t_squared_index=False):
"""
Predicts the labels for the observation columns in the given input frame. Creates a new frame
with the existing columns and a new predicted column.
Parameters
----------
:param frame: (Frame) Frame used for predicting the values
:param columns: (List[str]) Names of the observation columns.
:param mean_centered: (boolean) whether to mean center the columns. Default is true
:param k: (int) the number of principal components to be computed, must be <= the k used in training. Default is the trained k
:param t_squared_index: (boolean) whether the t-square index is to be computed. Default is false
:return: (Frame) A new frame containing the original frame's columns and a prediction column
"""
if mean_centered is None:
mean_centered = self.mean_centered
from sparktk.frame.frame import Frame
return Frame(self._tc, self._scala.predict(frame._scala,
self._tc.jutils.convert.to_scala_option_list_string(columns),
mean_centered,
self._tc.jutils.convert.to_scala_option(k),
t_squared_index))
def save(
self, path)
def save(self, path):
self._scala.save(self._tc._scala_sc, path)
def to_dict(
self)
def to_dict(self):
d = self._properties()
d.update(self._attributes())
return d
def to_json(
self)
def to_json(self):
return json.dumps(self.to_dict())