sparktk.frame.frame module
# vim: set encoding=utf-8
# Copyright (c) 2016 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from pyspark.rdd import RDD
from pyspark.sql import DataFrame
from sparktk.frame.pyframe import PythonFrame
from sparktk.frame.schema import schema_to_python, schema_to_scala
from sparktk import dtypes
import logging
logger = logging.getLogger('sparktk')
from sparktk.propobj import PropertiesObject
from sparktk import TkContext
# import constructors for the API's sake (not actually dependencies of the Frame class)
from sparktk.frame.constructors.create import create
from sparktk.frame.constructors.import_csv import import_csv
from sparktk.frame.constructors.import_hbase import import_hbase
from sparktk.frame.constructors.import_hive import import_hive
from sparktk.frame.constructors.import_jdbc import import_jdbc
from sparktk.frame.constructors.import_pandas import import_pandas
__all__ = ["create",
"Frame",
"import_csv",
"import_hbase",
"import_hive",
"import_jdbc",
"import_pandas",
"load"]
class Frame(object):
def __init__(self, tc, source, schema=None, validate_schema=False):
"""(Private constructor -- use tc.frame.create or other methods available from the TkContext)"""
self._tc = tc
if self._is_scala_frame(source):
self._frame = source
elif self._is_scala_rdd(source):
scala_schema = schema_to_scala(tc.sc, schema)
self._frame = self._create_scala_frame(tc.sc, source, scala_schema)
elif self._is_scala_dataframe(source):
self._frame = self._create_scala_frame_from_scala_dataframe(tc.sc, source)
elif isinstance(source, DataFrame):
self._frame = self._create_scala_frame_from_scala_dataframe(tc.sc, source._jdf)
elif isinstance(source, PythonFrame):
self._frame = source
else:
if not isinstance(source, RDD):
if not isinstance(source, list) or (len(source) > 0 and any(not isinstance(row, (list, tuple)) for row in source)):
raise TypeError("Invalid data source. The data parameter must be a 2-dimensional list (list of row data) or an RDD.")
inferred_schema = False
if isinstance(schema, list):
if all(isinstance(item, basestring) for item in schema):
# check if schema is just a list of column names (versus string and data type tuples)
schema = self._infer_schema(source, schema)
inferred_schema = True
elif not all(isinstance(item, tuple) and
len(item) == 2 and
isinstance(item[0], basestring) for item in schema):
raise TypeError("Invalid schema. Expected a list of tuples (str, type) with the column name and data type, but received type %s." % type(schema))
# check for duplicate column names
column_names = [col[0] for col in schema]
duplicate_column_names = set([col for col in column_names if column_names.count(col) > 1])
if len(duplicate_column_names) > 0:
raise ValueError("Invalid schema, column names cannot be duplicated: %s" % ", ".join(duplicate_column_names))
elif schema is None:
schema = self._infer_schema(source)
inferred_schema = True
else:
# Schema is not a list or None
raise TypeError("Invalid schema type: %s. Expected a list of tuples (str, type) with the column name and data type." % type(schema))
for item in schema:
if not self._is_supported_datatype(item[1]):
if inferred_schema:
raise TypeError("The %s data type was found when inferring the schema, and it is not a "
"supported data type. Instead, specify a schema that uses a supported data "
"type, and enable validate_schema so that the data is converted to the proper "
"data type.\n\nInferred schema: %s\n\nSupported data types: %s" %
(str(item[1]), str(schema), dtypes.dtypes))
else:
raise TypeError("Invalid schema. %s is not a supported data type.\n\nSupported data types: %s" %
(str(item[1]), dtypes.dtypes))
source = tc.sc.parallelize(source)
if schema and validate_schema:
# Validate schema by going through the data and checking the data type and attempting to parse it
validate_schema_result = self.validate_pyrdd_schema(source, schema)
source = validate_schema_result.validated_rdd
logger.debug("%s values were unable to be parsed to the schema's data type." % validate_schema_result.bad_value_count)
# If schema contains matrix datatype, then apply type_coercer to convert list[list] to numpy ndarray
map_source = MatrixCoercion.schema_is_coercible(source, list(schema))
self._frame = PythonFrame(map_source, schema)
def _merge_types(self, type_list_a, type_list_b):
"""
Merges two lists of data types
:param type_list_a: First list of data types to merge
:param type_list_b: Second list of data types to merge
:return: List of merged data types
"""
if not isinstance(type_list_a, list) or not isinstance(type_list_b, list):
raise TypeError("Unable to generate schema, because schema is not a list.")
if len(type_list_a) != len(type_list_b):
raise ValueError("Length of each row must be the same (found rows with lengths: %s and %s)." % (len(type_list_a), len(type_list_b)))
return [dtypes._DataTypes.merge_types(type_list_a[i], type_list_b[i]) for i in xrange(0, len(type_list_a))]
def _infer_types_for_row(self, row):
"""
Returns a list of data types for the data in the specified row
:param row: List or Row of data
:return: List of data types
"""
inferred_types = []
for item in row:
if item is None:
inferred_types.append(int)
elif not isinstance(item, list):
inferred_types.append(type(item))
else:
inferred_types.append(dtypes.vector((len(item))))
return inferred_types
def _infer_schema(self, data, column_names=[], sample_size=100):
"""
Infers the schema based on the data in the RDD.
:param sc: Spark Context
:param data: Data used to infer schema
:param column_names: Optional column names to use in the schema. If no column names are provided, columns
are given numbered names. If there are more columns in the RDD than there are in the
column_names list, remaining columns will be numbered.
:param sample_size: Number of rows to check when inferring the schema. Defaults to 100.
:return: Schema
"""
inferred_schema = []
if isinstance(data, list):
if len(data) > 0:
# get the schema for the first row
data_types = self._infer_types_for_row(data[0])
sample_size = min(sample_size, len(data))
for i in xrange (1, sample_size):
data_types = self._merge_types(data_types, self._infer_types_for_row(data[i]))
for i, data_type in enumerate(data_types):
column_name = "C%s" % i
if len(column_names) > i:
column_name = column_names[i]
inferred_schema.append((column_name, data_type))
else:
raise TypeError("Unable to infer schema, because the data provided is not a list.")
return inferred_schema
def _is_supported_datatype(self, data_type):
"""
Returns True if the specified data_type is supported.
"""
supported_primitives = [int, float, long, str, unicode]
if data_type in supported_primitives:
return True
elif data_type is dtypes.datetime:
return True
elif type(data_type) is dtypes.vector:
return True
elif data_type is dtypes.matrix:
return True
else:
return False
def validate_pyrdd_schema(self, pyrdd, schema):
if isinstance(pyrdd, RDD):
schema_length = len(schema)
num_bad_values = self._tc.sc.accumulator(0)
def validate_schema(row, accumulator):
data = []
if len(row) != schema_length:
raise ValueError("Length of the row (%s) does not match the schema length (%s)." % (len(row), len(schema)))
for index, column in enumerate(schema):
data_type = column[1]
try:
if row[index] is not None:
data.append(dtypes.dtypes.cast(row[index], data_type))
except:
data.append(None)
accumulator += 1
return data
validated_rdd = pyrdd.map(lambda row: validate_schema(row, num_bad_values))
# Force rdd to load, so that we can get a bad value count
validated_rdd.count()
return SchemaValidationReturn(validated_rdd, num_bad_values.value)
else:
raise TypeError("Unable to validate schema, because the pyrdd provided is not an RDD.")
@staticmethod
def _create_scala_frame(sc, scala_rdd, scala_schema):
"""call constructor in JVM"""
return sc._jvm.org.trustedanalytics.sparktk.frame.Frame(scala_rdd, scala_schema, False)
@staticmethod
def _create_scala_frame_from_scala_dataframe(sc, scala_dataframe):
"""call constructor in JVM"""
return sc._jvm.org.trustedanalytics.sparktk.frame.Frame(scala_dataframe)
@staticmethod
def _from_scala(tc, scala_frame):
"""creates a python Frame for the given scala Frame"""
return Frame(tc, scala_frame)
def _frame_to_scala(self, python_frame):
"""converts a PythonFrame to a Scala Frame"""
scala_schema = schema_to_scala(self._tc.sc, python_frame.schema)
scala_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.rdd.PythonJavaRdd.pythonToScala(python_frame.rdd._jrdd, scala_schema)
return self._create_scala_frame(self._tc.sc, scala_rdd, scala_schema)
def _is_scala_frame(self, item):
return self._tc._jutils.is_jvm_instance_of(item, self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.Frame)
def _is_scala_rdd(self, item):
return self._tc._jutils.is_jvm_instance_of(item, self._tc.sc._jvm.org.apache.spark.rdd.RDD)
def _is_scala_dataframe(self, item):
return self._tc._jutils.is_jvm_instance_of(item, self._tc.sc._jvm.org.apache.spark.sql.DataFrame)
def _is_python_rdd(self, item):
return isinstance(item, RDD)
@property
def _is_scala(self):
"""answers whether the current frame is backed by a Scala Frame"""
return self._is_scala_frame(self._frame)
@property
def _is_python(self):
"""answers whether the current frame is backed by a _PythonFrame"""
return not self._is_scala
@property
def _scala(self):
"""gets frame backend as Scala Frame, causes conversion if it is current not"""
if self._is_python:
# If schema contains matrix datatype,
# then apply type_coercer_pymllib to convert ndarray to pyspark.mllib DenseMatrix for serialization purposes on the Java side
self._frame.rdd = MatrixCoercion.schema_is_coercible(self._frame.rdd, list(self._frame.schema), True)
# convert PythonFrame to a Scala Frame
scala_schema = schema_to_scala(self._tc.sc, self._frame.schema)
scala_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.pythonToScala(self._frame.rdd._jrdd, scala_schema)
self._frame = self._create_scala_frame(self._tc.sc, scala_rdd, scala_schema)
return self._frame
@property
def _python(self):
"""gets frame backend as _PythonFrame, causes conversion if it is current not"""
if self._is_scala:
# convert Scala Frame to a PythonFrame"""
scala_schema = self._frame.schema()
java_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(self._frame.rdd())
python_schema = schema_to_python(self._tc.sc, scala_schema)
python_rdd = RDD(java_rdd, self._tc.sc)
# If schema contains matrix datatype, then apply type_coercer to convert list[list] to numpy ndarray
map_python_rdd = MatrixCoercion.schema_is_coercible(python_rdd, list(python_schema))
self._frame = PythonFrame(map_python_rdd, python_schema)
return self._frame
##########################################################################
# API
##########################################################################
@property
def rdd(self):
"""pyspark RDD (causes conversion if currently backed by a Scala RDD)"""
return self._python.rdd
@property
def dataframe(self):
"""pyspark DataFrame (causes conversion through Scala)"""
return DataFrame(self._scala.dataframe(), self._tc.sql_context)
@property
def schema(self):
if self._is_scala:
return schema_to_python(self._tc.sc, self._frame.schema()) # need ()'s on schema because it's a def in scala
return self._frame.schema
@property
def column_names(self):
"""
Column identifications in the current frame.
:return: list of names of all the frame's columns
Returns the names of the columns of the current frame.
Examples
--------
>>> frame.column_names
[u'name', u'age', u'tenure', u'phone']
"""
return [name for name, data_type in self.schema]
# Frame Operations
from sparktk.frame.ops.add_columns import add_columns
from sparktk.frame.ops.append import append
from sparktk.frame.ops.assign_sample import assign_sample
from sparktk.frame.ops.bin_column import bin_column
from sparktk.frame.ops.binary_classification_metrics import binary_classification_metrics
from sparktk.frame.ops.box_cox import box_cox
from sparktk.frame.ops.categorical_summary import categorical_summary
from sparktk.frame.ops.collect import collect
from sparktk.frame.ops.column_median import column_median
from sparktk.frame.ops.column_mode import column_mode
from sparktk.frame.ops.column_summary_statistics import column_summary_statistics
from sparktk.frame.ops.copy import copy
from sparktk.frame.ops.correlation import correlation
from sparktk.frame.ops.correlation_matrix import correlation_matrix
from sparktk.frame.ops.count import count
from sparktk.frame.ops.covariance import covariance
from sparktk.frame.ops.covariance_matrix import covariance_matrix
from sparktk.frame.ops.cumulative_percent import cumulative_percent
from sparktk.frame.ops.cumulative_sum import cumulative_sum
from sparktk.frame.ops.dot_product import dot_product
from sparktk.frame.ops.drop_columns import drop_columns
from sparktk.frame.ops.drop_duplicates import drop_duplicates
from sparktk.frame.ops.drop_rows import drop_rows
from sparktk.frame.ops.ecdf import ecdf
from sparktk.frame.ops.entropy import entropy
from sparktk.frame.ops.export_data import export_to_csv, export_to_jdbc, export_to_json, export_to_hbase, export_to_hive
from sparktk.frame.ops.filter import filter
from sparktk.frame.ops.flatten_columns import flatten_columns
from sparktk.frame.ops.group_by import group_by
from sparktk.frame.ops.histogram import histogram
from sparktk.frame.ops.inspect import inspect
from sparktk.frame.ops.join_inner import join_inner
from sparktk.frame.ops.join_left import join_left
from sparktk.frame.ops.join_right import join_right
from sparktk.frame.ops.join_outer import join_outer
from sparktk.frame.ops.map_columns import map_columns
from sparktk.frame.ops.matrix_covariance_matrix import matrix_covariance_matrix
from sparktk.frame.ops.matrix_pca import matrix_pca
from sparktk.frame.ops.matrix_svd import matrix_svd
from sparktk.frame.ops.multiclass_classification_metrics import multiclass_classification_metrics
from sparktk.frame.ops.power_iteration_clustering import power_iteration_clustering
from sparktk.frame.ops.quantile_bin_column import quantile_bin_column
from sparktk.frame.ops.quantiles import quantiles
from sparktk.frame.ops.rename_columns import rename_columns
from sparktk.frame.ops.reverse_box_cox import reverse_box_cox
from sparktk.frame.ops.save import save
from sparktk.frame.ops.sort import sort
from sparktk.frame.ops.sortedk import sorted_k
from sparktk.frame.ops.take import take
from sparktk.frame.ops.tally import tally
from sparktk.frame.ops.tally_percent import tally_percent
from sparktk.frame.ops.timeseries_augmented_dickey_fuller_test import timeseries_augmented_dickey_fuller_test
from sparktk.frame.ops.timeseries_breusch_godfrey_test import timeseries_breusch_godfrey_test
from sparktk.frame.ops.timeseries_breusch_pagan_test import timeseries_breusch_pagan_test
from sparktk.frame.ops.timeseries_durbin_watson_test import timeseries_durbin_watson_test
from sparktk.frame.ops.timeseries_from_observations import timeseries_from_observations
from sparktk.frame.ops.timeseries_slice import timeseries_slice
from sparktk.frame.ops.to_pandas import to_pandas
from sparktk.frame.ops.topk import top_k
from sparktk.frame.ops.unflatten_columns import unflatten_columns
def load(path, tc=TkContext.implicit):
"""load Frame from given path"""
TkContext.validate(tc)
return tc.load(path, Frame)
class SchemaValidationReturn(PropertiesObject):
"""
Return value from schema validation that includes the rdd of validated values and the number of bad values
that were found.
"""
def __init__(self, validated_rdd, bad_value_count):
self._validated_rdd = validated_rdd
self._bad_value_count = bad_value_count
@property
def validated_rdd(self):
"""
RDD of values that have been cast to the data type specified by the frame's schema.
"""
return self._validated_rdd
@property
def bad_value_count(self):
"""
Number of values that were unable to be parsed to the data type specified by the schema.
"""
return self._bad_value_count
class MatrixCoercion(object):
@staticmethod
def schema_is_coercible(source, python_schema, in_scala=False):
"""
Checks whether the python schema is coercible, i.e. whether it contains the matrix data type.
If so, the source RDD is mapped so that matrix column values are converted: list[list] to numpy ndarray (or, when in_scala is True, ndarray to mllib DenseMatrix).
"""
flag = False
for schema in python_schema:
if type(schema[1]) == dtypes._Matrix:
flag = True
break
if flag:
if in_scala:
map_source = source.map(MatrixCoercion.type_coercer_pymllib(python_schema))
else:
map_source = source.map(MatrixCoercion.type_coercer(python_schema))
else:
map_source = source
return map_source
@staticmethod
def type_coercer(schema):
"""
When creating a new (python) frame, or when converting a frame from scala to python,
this function scans each row and does the following:
* when creating a new python frame - if it finds a list[list] (which represents a matrix) as a column value,
it converts it to a numpy ndarray
* when converting a frame from scala to python - (scala converts DenseMatrix --> JList[JList[Double]] in JConvert.scala,
and jconvert.py converts JList[JList[Double]] --> list[list[float64]]), it converts the resulting list[list] to an ndarray
"""
def decorator(row):
result = []
import numpy as np
for i in xrange(len(schema)):
if type(schema[i][1]) == dtypes._Matrix:
if isinstance(row[i], list):
result.append(np.array(row[i], dtype=np.float64))
else:
result.append(row[i])
else:
result.append(row[i])
return result
return decorator
@staticmethod
def type_coercer_pymllib(schema):
"""
When converting from python to scala, this function scans each row and converts ndarrays
to pyspark mllib DenseMatrix objects, so that the auto-picklers understand how to serialize from pyspark mllib DenseMatrix to Scala MLlib DenseMatrix.
For serialization to work we have to explicitly call SparkAliases.getSparkMLLibSerDe in the pythonToScala() method of the PythonJavaRdd.scala class.
An ndarray stores data as row-major, whereas an mllib DenseMatrix stores data as column-major.
To construct the mllib DenseMatrix as row-major we set isTransposed=True.
"""
def decorator(row):
result = []
from pyspark.mllib.linalg import DenseMatrix
for i in xrange(len(schema)):
if type(schema[i][1]) == dtypes._Matrix:
shape = row[i].shape
arr = row[i].flatten()
# By default, mllib DenseMatrix constructs a column-major matrix.
# Setting isTransposed=True constructs a row-major DenseMatrix instead.
dm = DenseMatrix(shape[0], shape[1], arr, isTransposed=True)
result.append(dm)
else:
result.append(row[i])
return result
return decorator
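To illustrate what the two coercers do, here is a minimal sketch (an illustration only, assuming numpy and pyspark.mllib are available) of the conversions applied to a single matrix cell:
>>> import numpy as np
>>> from pyspark.mllib.linalg import DenseMatrix
>>> cell = [[1.0, 2.0], [3.0, 4.0]]        # matrix column value arrives as list[list]
>>> nd = np.array(cell, dtype=np.float64)  # type_coercer: list[list] -> numpy ndarray
>>> dm = DenseMatrix(nd.shape[0], nd.shape[1], nd.flatten(), isTransposed=True)  # type_coercer_pymllib: ndarray -> row-major DenseMatrix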
Functions
def create(
data, schema=None, validate_schema=False, tc=<class 'sparktk.arguments.implicit'>)
Creates a frame from the given data and schema. If no schema data types are provided, the schema is inferred based on the data in the first 100 rows.
If schema validation is enabled, all data is checked to ensure that it matches the schema. If the data does not match the schema's data type, it attempts to cast the data to the proper data type. When the data cannot be cast to the schema's data type, the item will be missing (None) in the frame.
data | (List of row data or RDD): | Data source |
schema | (Optional(list[tuple(str, type)] or list[str])): | Optionally specify a schema (list of tuples of string column names and data types), column names (list of strings, in which case the column data types will be inferred), or None (column data types will be inferred and column names will be numbered like C0, C1, C2, etc). |
validate_schema | (Optional(bool)): | When True, all data is checked to ensure that it matches the schema. If the data does not match the schema's data type, it attempts to cast the data to the proper data type. When the data cannot be cast to the schema's data type, a missing value (None) is inserted in its place. Defaults to False. |
tc | (TkContext): | TK context |
Returns | (Frame): | Frame loaded with the specified data |
Create a frame with the specified data.
>>> data = [["Bob", 30, 8], ["Jim", 45, 9.5], ["Sue", 25, 7], ["George", 15, 6], ["Jennifer", 18, 8.5]]
>>> frame = tc.frame.create(data)
Since no schema is provided, the schema will be inferred. Note that the data set had a mix of integers and floats in the third column. The schema will use the most general data type from the data that it sees, so in this example, the column is treated as a float.
>>> frame.schema
[('C0', <type 'str'>), ('C1', <type 'int'>), ('C2', <type 'float'>)]
>>> frame.inspect()
[#] C0 C1 C2
======================
[0] Bob 30 8
[1] Jim 45 9.5
[2] Sue 25 7
[3] George 15 6
[4] Jennifer 18 8.5
We could also enable schema validation, which checks the data against the schema. If the data does not match the schema's data type, it attempts to cast the data to the proper data type.
>>> frame = tc.frame.create(data, validate_schema=True)
In this example with schema validation enabled, the integers in column C2 get casted to floats:
>>> frame.inspect()
[#] C0 C1 C2
======================
[0] Bob 30 8.0
[1] Jim 45 9.5
[2] Sue 25 7.0
[3] George 15 6.0
[4] Jennifer 18 8.5
We could also provide a list of column names when creating the frame. When a list of column names is provided, the data types for the schema are still inferred, but the columns in the schema are labeled with the specified names.
>>> frame = tc.frame.create(data, schema=["name", "age", "shoe_size"], validate_schema=True)
>>> frame.schema
[('name', <type 'str'>), ('age', <type 'int'>), ('shoe_size', <type 'float'>)]
>>> frame.inspect()
[#] name age shoe_size
=============================
[0] Bob 30 8.0
[1] Jim 45 9.5
[2] Sue 25 7.0
[3] George 15 6.0
[4] Jennifer 18 8.5
Note that if a value cannot be parsed as the specified data type in the schema, it will show up as missing (None) if validate_schema is enabled. For example, consider the following frame where columns are defined as integers, but the data specified has a string in the second row.
>>> data = [[1, 2, 3], [4, "five", 6]]
>>> schema = [("a", int), ("b", int), ("c", int)]
>>> frame = tc.frame.create(data, schema, validate_schema = True)
>>> frame.inspect()
[#] a b c
===============
[0] 1 2 3
[1] 4 None 6
Note that the spot where the string was located has its value missing (None), since it couldn't be parsed to an integer. If validate_schema was disabled, no attempt would be made to parse the data to the data type specified by the schema, and further frame operations may fail due to the data type discrepancy.
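The data parameter is not limited to local Python lists; an existing pyspark RDD (or DataFrame) can also be used as the source. A minimal sketch, assuming tc is an active TkContext (when the source is an RDD, a full schema of (name, type) tuples should be supplied, since schema inference only operates on local lists of rows):
>>> rdd = tc.sc.parallelize([[1, "a"], [2, "b"], [3, "c"]])
>>> frame = tc.frame.create(rdd, schema=[("number", int), ("letter", str)])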
def create(data, schema=None, validate_schema=False, tc=TkContext.implicit):
"""
Creates a frame from the given data and schema. If no schema data types are provided, the schema is inferred
based on the data in the first 100 rows.
If schema validation is enabled, all data is checked to ensure that it matches the schema. If the data does
not match the schema's data type, it attempts to cast the data to the proper data type. When the data cannot
be cast to the schema's data type, the item will be missing (None) in the frame.
Parameters
----------
:param data: (List of row data or RDD) Data source
:param schema: (Optional(list[tuple(str, type)] or list[str])) Optionally specify a schema (list of tuples of
string column names and data type), column names (list of strings, and the column data types will
be inferred) or None (column data types will be inferred and column names will be numbered like C0,
C1, C2, etc).
:param validate_schema: (Optional(bool)) When True, all data is checked to ensure that it matches the schema.
If the data does not match the schema's data type, it attempts to cast the data to the
proper data type. When the data cannot be cast to the schema's data type, a
missing value (None) is inserted in its place. Defaults to False.
:param tc: TkContext
:return: (Frame) Frame loaded with the specified data
Examples
--------
Create a frame with the specified data.
>>> data = [["Bob", 30, 8], ["Jim", 45, 9.5], ["Sue", 25, 7], ["George", 15, 6], ["Jennifer", 18, 8.5]]
>>> frame = tc.frame.create(data)
Since no schema is provided, the schema will be inferred. Note that the data set had a mix of integers and
floats in the third column. The schema will use the most general data type from the data that it sees, so in
this example, the column is treated as a float.
>>> frame.schema
[('C0', <type 'str'>), ('C1', <type 'int'>), ('C2', <type 'float'>)]
>>> frame.inspect()
[#] C0 C1 C2
======================
[0] Bob 30 8
[1] Jim 45 9.5
[2] Sue 25 7
[3] George 15 6
[4] Jennifer 18 8.5
We could also enable schema validation, which checks the data against the schema. If the data does not match the
schema's data type, it attempts to cast the data to the proper data type.
>>> frame = tc.frame.create(data, validate_schema=True)
In this example with schema validation enabled, the integers in column C2 get casted to floats:
>>> frame.inspect()
[#] C0 C1 C2
======================
[0] Bob 30 8.0
[1] Jim 45 9.5
[2] Sue 25 7.0
[3] George 15 6.0
[4] Jennifer 18 8.5
We could also provide a list of column names when creating the frame. When a list of column names is provided,
the data types for the schema are still inferred, but the columns in the schema are labeled with the specified names.
>>> frame = tc.frame.create(data, schema=["name", "age", "shoe_size"], validate_schema=True)
>>> frame.schema
[('name', <type 'str'>), ('age', <type 'int'>), ('shoe_size', <type 'float'>)]
>>> frame.inspect()
[#] name age shoe_size
=============================
[0] Bob 30 8.0
[1] Jim 45 9.5
[2] Sue 25 7.0
[3] George 15 6.0
[4] Jennifer 18 8.5
Note that if a value cannot be parsed as the specified data type in the schema, it will show up as missing (None)
if validate_schema is enabled. For example, consider the following frame where columns are defined as integers,
but the data specified has a string in the second row.
>>> data = [[1, 2, 3], [4, "five", 6]]
>>> schema = [("a", int), ("b", int), ("c", int)]
>>> frame = tc.frame.create(data, schema, validate_schema = True)
>>> frame.inspect()
[#] a b c
===============
[0] 1 2 3
[1] 4 None 6
Note that the spot where the string was located has its value missing (None), since it couldn't be parsed to an
integer. If validate_schema was disabled, no attempt would be made to parse the data to the data type specified by the
schema, and further frame operations may fail due to the data type discrepancy.
"""
TkContext.validate(tc)
if data is None:
data = []
if not isinstance(data, list)\
and not isinstance(data, (RDD, DataFrame))\
and not tc._jutils.is_jvm_instance_of(data, tc.sc._jvm.org.apache.spark.rdd.RDD)\
and not tc._jutils.is_jvm_instance_of(data, tc.sc._jvm.org.apache.spark.sql.DataFrame):
raise TypeError("Invalid data source. Expected the data parameter to be a 2-dimensional list (list of row data) or an RDD or DataFrame, but received: %s" % type(data))
from sparktk.frame.frame import Frame
return Frame(tc, data, schema, validate_schema)
def import_csv(
path, delimiter=',', header=False, infer_schema=True, schema=None, datetime_format="yyyy-MM-dd'T'HH:mm:ss.SSSX", tc=<class 'sparktk.arguments.implicit'>)
Creates a frame with data from a csv file.
path | (str): | Full path to the csv file |
delimiter | (Optional[str]): | A string which indicates the separation of data fields. This is usually a single character and could be a non-visible character, such as a tab. The default delimiter is a comma (,). |
header | (Optional[bool]): | Boolean value indicating if the first line of the file will be used to name columns, and not be included in the data. The default value is false. |
infer_schema | (Optional[bool]): | Boolean value indicating if the column types will be automatically inferred. It requires one extra pass over the data and is true by default. |
schema | (Optional[List[tuple(str, type)]]): | Optionally specify the schema for the dataset. Number of columns specified in the schema must match the number of columns in the csv file provided. If the value from the csv file cannot be converted to the data type specified by the schema (for example, if the csv file has a string, and the schema specifies an int), the value will show up as missing (None) in the frame. |
datetime_format | (str): | String specifying how date/time columns are formatted, using the java.text.SimpleDateFormat specified at https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html |
Returns | (Frame): | Frame that contains the data from the csv file |
Load a frame from a csv file by specifying the path to the file, delimiter, and options that specify that there is a header and to infer the schema based on the data.
>>> file_path = "../datasets/cities.csv"
>>> frame = tc.frame.import_csv(file_path, "|", header=True, infer_schema=True)
-etc-
>>> frame.inspect()
[#] rank city population_2013 population_2010 change county
============================================================================
[0] 1 Portland 609456 583776 4.40% Multnomah
[1] 2 Salem 160614 154637 3.87% Marion
[2] 3 Eugene 159190 156185 1.92% Lane
[3] 4 Gresham 109397 105594 3.60% Multnomah
[4] 5 Hillsboro 97368 91611 6.28% Washington
[5] 6 Beaverton 93542 89803 4.16% Washington
[6] 15 Grants Pass 35076 34533 1.57% Josephine
[7] 16 Oregon City 34622 31859 8.67% Clackamas
[8] 17 McMinnville 33131 32187 2.93% Yamhill
[9] 18 Redmond 27427 26215 4.62% Deschutes
>>> frame.schema
[('rank', <type 'int'>), ('city', <type 'str'>), ('population_2013', <type 'int'>), ('population_2010', <type 'int'>), ('change', <type 'str'>), ('county', <type 'str'>)]
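If the inferred types are not what is wanted, a schema can be supplied explicitly; when a schema is provided, the loader skips schema inference (see the source below). A sketch, where the column names and types are assumptions based on the file above:
>>> schema = [("rank", int), ("city", str), ("population_2013", int), ("population_2010", int), ("change", str), ("county", str)]
>>> frame = tc.frame.import_csv(file_path, "|", header=True, schema=schema)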
def import_csv(path, delimiter=",", header=False, infer_schema=True, schema=None, datetime_format="yyyy-MM-dd'T'HH:mm:ss.SSSX", tc=TkContext.implicit):
"""
Creates a frame with data from a csv file.
Parameters
----------
:param path: (str) Full path to the csv file
:param delimiter: (Optional[str]) A string which indicates the separation of data fields. This is usually a
single character and could be a non-visible character, such as a tab. The default delimiter
is a comma (,).
:param header: (Optional[bool]) Boolean value indicating if the first line of the file will be used to name columns,
and not be included in the data. The default value is false.
:param infer_schema: (Optional[bool]) Boolean value indicating if the column types will be automatically inferred.
It requires one extra pass over the data and is true by default.
:param schema: (Optional[List[tuple(str, type)]]) Optionally specify the schema for the dataset. Number of
columns specified in the schema must match the number of columns in the csv file provided. If the
value from the csv file cannot be converted to the data type specified by the schema (for example,
if the csv file has a string, and the schema specifies an int), the value will show up as missing
(None) in the frame.
:param datetime_format: (str) String specifying how date/time columns are formatted, using the java.text.SimpleDateFormat
specified at https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html
:return: (Frame) Frame that contains the data from the csv file
Examples
--------
Load a frame from a csv file by specifying the path to the file, delimiter, and options that specify that
there is a header and to infer the schema based on the data.
>>> file_path = "../datasets/cities.csv"
>>> frame = tc.frame.import_csv(file_path, "|", header=True, infer_schema=True)
-etc-
>>> frame.inspect()
[#] rank city population_2013 population_2010 change county
============================================================================
[0] 1 Portland 609456 583776 4.40% Multnomah
[1] 2 Salem 160614 154637 3.87% Marion
[2] 3 Eugene 159190 156185 1.92% Lane
[3] 4 Gresham 109397 105594 3.60% Multnomah
[4] 5 Hillsboro 97368 91611 6.28% Washington
[5] 6 Beaverton 93542 89803 4.16% Washington
[6] 15 Grants Pass 35076 34533 1.57% Josephine
[7] 16 Oregon City 34622 31859 8.67% Clackamas
[8] 17 McMinnville 33131 32187 2.93% Yamhill
[9] 18 Redmond 27427 26215 4.62% Deschutes
>>> frame.schema
[('rank', <type 'int'>), ('city', <type 'str'>), ('population_2013', <type 'int'>), ('population_2010', <type 'int'>), ('change', <type 'str'>), ('county', <type 'str'>)]
"""
if schema is not None:
infer_schema = False # if a custom schema is provided, don't waste time inferring the schema during load
sparktk_schema.validate(schema)
if not isinstance(header, bool):
raise ValueError("header parameter must be a boolean, but is {0}.".format(type(header)))
if not isinstance(infer_schema, bool):
raise ValueError("infer_schema parameter must be a boolean, but is {0}.".format(type(infer_schema)))
TkContext.validate(tc)
header_str = str(header).lower()
infer_schema_str = str(infer_schema).lower()
pyspark_schema = None
if (not infer_schema) and (schema is not None):
fields = []
for column in schema:
if dtypes._data_type_to_pyspark_type_table.has_key(column[1]):
fields.append(StructField(column[0], dtypes._data_type_to_pyspark_type_table[column[1]], True))
else:
raise TypeError("Unsupported type {0} in schema for column {1}.".format(column[1], column[0]))
pyspark_schema = StructType(fields)
df = tc.sql_context.read.format(
"com.databricks.spark.csv.org.trustedanalytics.sparktk").options(
delimiter=delimiter,
header=header_str,
dateformat=datetime_format,
inferschema=infer_schema_str).load(path, schema=pyspark_schema)
df_schema = []
if schema is None:
for column in df.schema.fields:
try:
datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(type(column.dataType))
except ValueError:
raise TypeError("Unsupported data type ({0}) for column {1}.".format(str(column.dataType), column.name))
df_schema.append((column.name, datatype))
else:
df_column_count = len(df.schema.fields)
custom_column_count = len(schema)
if (df_column_count != custom_column_count):
raise ValueError("Bad schema value. The number of columns in the custom schema ({0}) must match the"
"number of columns in the csv file data ({1}).".format(custom_column_count, df_column_count))
df_schema = schema
def cast_datetime(row):
"""
The spark data frame gives us datetime objects. Convert them to long (ms since epoch) for our frame.
"""
data = []
for column_index in xrange(0, len(df_schema)):
if df_schema[column_index][1] == dtypes.datetime and isinstance(row[column_index], datetime):
data.append(long(dtypes.datetime_to_ms(row[column_index])))
else:
data.append(row[column_index])
return data
jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(df._jdf.rdd())
rdd = RDD(jrdd, tc.sc)
if any(c[1] == dtypes.datetime for c in df_schema):
# If any columns are date/time we must do this map
rdd = df.rdd.map(cast_datetime)
from sparktk.frame.frame import Frame # circular dependency, so import late
return Frame(tc, rdd, df_schema)
def import_hbase(
table_name, schema, start_tag=None, end_tag=None, tc=<class 'sparktk.arguments.implicit'>)
Import data from hbase table into frame
table_name | (str): | hbase table name |
schema | (list[list[str, str, type]]): | hbase schema as a List of List(string) (columnFamily, columnName, dataType for cell value) |
start_tag | (Optional(str)): | optional start tag for filtering |
end_tag | (Optional(str)): | optional end tag for filtering |
Returns | (Frame): | frame with data from hbase table |
Load data into frame from a hbase table
>>> frame = tc.frame.import_hbase("demo_test_hbase", [["test_family", "a", int],["test_family", "b", float], ["test_family", "c", int],["test_family", "d", int]])
-etc-
>>> frame.inspect()
[#] test_family_a test_family_b test_family_c test_family_d
===============================================================
[0] 1 0.2 -2 5
[1] 2 0.4 -1 6
[2] 3 0.6 0 7
[3] 4 0.8 1 8
Use of start_tag and end_tag (hbase creates a unique row id for data in hbase tables):
start_tag: the unique row id at which the row scan should start
end_tag: the unique row id at which the row scan should end
Assume the hbase table "test_startendtag" already has data under the "startendtag" family name, with a single column named "number".
The column contains values from 1 to 99, and the row ids are generated by hbase.
A few sample rows from the hbase table look as follows:
hbase(main):002:0> scan "test_startendtag"
ROW COLUMN+CELL
0 column=startendtag:number, timestamp=1465342524846, value=1
1 column=startendtag:number, timestamp=1465342524846, value=25
10 column=startendtag:number, timestamp=1465342524847, value=51
103 column=startendtag:number, timestamp=1465342524851, value=98
107 column=startendtag:number, timestamp=1465342524851, value=99
11 column=startendtag:number, timestamp=1465342524851, value=75
12 column=startendtag:number, timestamp=1465342524846, value=4
13 column=startendtag:number, timestamp=1465342524846, value=28
14 column=startendtag:number, timestamp=1465342524847, value=52
15 column=startendtag:number, timestamp=1465342524851, value=76
16 column=startendtag:number, timestamp=1465342524846, value=5
17 column=startendtag:number, timestamp=1465342524846, value=29
18 column=startendtag:number, timestamp=1465342524847, value=53
19 column=startendtag:number, timestamp=1465342524851, value=77
2 column=startendtag:number, timestamp=1465342524847, value=49
20 column=startendtag:number, timestamp=1465342524846, value=6
21 column=startendtag:number, timestamp=1465342524846, value=30
>>> frame = tc.frame.import_hbase("test_startendtag", [["startendtag", "number", int]], start_tag="20", end_tag="50")
-etc-
>>> frame.count()
33
>>> frame.inspect(33)
[##] startendtag_number
========================
[0] 6
[1] 30
[2] 54
[3] 78
[4] 7
[5] 31
[6] 55
[7] 79
[8] 8
[9] 32
[10] 73
[11] 56
[12] 80
[13] 9
[14] 33
[15] 57
[16] 81
[17] 10
[18] 34
[19] 58
[##] startendtag_number
========================
[20] 82
[21] 2
[22] 11
[23] 35
[24] 59
[25] 83
[26] 12
[27] 36
[28] 60
[29] 84
[30] 13
[31] 37
[32] 26
def import_hbase(table_name, schema, start_tag=None, end_tag=None, tc=TkContext.implicit):
"""
Import data from hbase table into frame
Parameters
----------
:param table_name: (str) hbase table name
:param schema: (list[list[str, str, type]]) hbase schema as a List of List(string) (columnFamily, columnName,
dataType for cell value)
:param start_tag: (Optional(str)) optional start tag for filtering
:param end_tag: (Optional(str)) optional end tag for filtering
:return: (Frame) frame with data from hbase table
Example
---------
Load data into frame from a hbase table
>>> frame = tc.frame.import_hbase("demo_test_hbase", [["test_family", "a", int],["test_family", "b", float], ["test_family", "c", int],["test_family", "d", int]])
-etc-
>>> frame.inspect()
[#] test_family_a test_family_b test_family_c test_family_d
===============================================================
[0] 1 0.2 -2 5
[1] 2 0.4 -1 6
[2] 3 0.6 0 7
[3] 4 0.8 1 8
Use of start_tag and end_tag (hbase creates a unique row id for data in hbase tables):
start_tag: the unique row id at which the row scan should start
end_tag: the unique row id at which the row scan should end
Assume the hbase table "test_startendtag" already has data under the "startendtag" family name, with a single column named "number".
The column contains values from 1 to 99, and the row ids are generated by hbase.
A few sample rows from the hbase table look as follows:
hbase(main):002:0> scan "test_startendtag"
ROW COLUMN+CELL
0 column=startendtag:number, timestamp=1465342524846, value=1
1 column=startendtag:number, timestamp=1465342524846, value=25
10 column=startendtag:number, timestamp=1465342524847, value=51
103 column=startendtag:number, timestamp=1465342524851, value=98
107 column=startendtag:number, timestamp=1465342524851, value=99
11 column=startendtag:number, timestamp=1465342524851, value=75
12 column=startendtag:number, timestamp=1465342524846, value=4
13 column=startendtag:number, timestamp=1465342524846, value=28
14 column=startendtag:number, timestamp=1465342524847, value=52
15 column=startendtag:number, timestamp=1465342524851, value=76
16 column=startendtag:number, timestamp=1465342524846, value=5
17 column=startendtag:number, timestamp=1465342524846, value=29
18 column=startendtag:number, timestamp=1465342524847, value=53
19 column=startendtag:number, timestamp=1465342524851, value=77
2 column=startendtag:number, timestamp=1465342524847, value=49
20 column=startendtag:number, timestamp=1465342524846, value=6
21 column=startendtag:number, timestamp=1465342524846, value=30
>>> frame = tc.frame.import_hbase("test_startendtag", [["startendtag", "number", int]], start_tag="20", end_tag="50")
-etc-
>>> frame.count()
33
>>> frame.inspect(33)
[##] startendtag_number
========================
[0] 6
[1] 30
[2] 54
[3] 78
[4] 7
[5] 31
[6] 55
[7] 79
[8] 8
[9] 32
[10] 73
[11] 56
[12] 80
[13] 9
[14] 33
[15] 57
[16] 81
[17] 10
[18] 34
[19] 58
[##] startendtag_number
========================
[20] 82
[21] 2
[22] 11
[23] 35
[24] 59
[25] 83
[26] 12
[27] 36
[28] 60
[29] 84
[30] 13
[31] 37
[32] 26
"""
if not isinstance(table_name, basestring):
raise ValueError("table name parameter must be a string, but is {0}.".format(type(table_name)))
if not isinstance(schema, list):
raise ValueError("schema parameter must be a list, but is {0}.".format(type(table_name)))
TkContext.validate(tc)
inner_lists=[tc._jutils.convert.to_scala_list([item[0], item[1], dtypes.to_string(item[2])]) for item in schema]
scala_final_schema = tc.jutils.convert.to_scala_list(inner_lists)
scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importHbase(tc.jutils.get_scala_sc(),
table_name, scala_final_schema,
tc._jutils.convert.to_scala_option(start_tag),
tc._jutils.convert.to_scala_option(end_tag))
from sparktk.frame.frame import Frame
return Frame(tc, scala_frame)
def import_hive(
hive_query, tc=<class 'sparktk.arguments.implicit'>)
Import data from hive table into frame.
Define the sql query to retrieve the data from a hive table.
Only a subset of Hive data types are supported:
DataType Support
---------- ------------------------------------
boolean cast to int
bigint native support
int native support
tinyint cast to int
smallint cast to int
decimal cast to double, may lose precision
double native support
float native support
date cast to string
string native support
timestamp cast to string
varchar cast to string
arrays not supported
binary not supported
char not supported
maps not supported
structs not supported
union not supported
hive_query | (str): | hive query to fetch data from table |
tc | (TkContext): | TK context |
Returns | (Frame): | returns frame with hive table data |
Load data into frame from a hive table based on hive query
>>> h_query = "select * from demo_test"
>>> frame = tc.frame.import_hive(h_query)
-etc-
>>> frame.inspect()
[#] number strformat
======================
[0] 1 one
[1] 2 two
[2] 3 three
[3] 4 four
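Because the import is driven by a query, row filtering and column selection can be pushed down into Hive itself. A sketch, reusing the demo_test table and its number column from the example above:
>>> frame = tc.frame.import_hive("select number from demo_test where number > 2")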
def import_hive(hive_query, tc=TkContext.implicit):
"""
Import data from hive table into frame.
Define the sql query to retrieve the data from a hive table.
Only a subset of Hive data types are supported:
DataType Support
---------- ------------------------------------
boolean cast to int
bigint native support
int native support
tinyint cast to int
smallint cast to int
decimal cast to double, may lose precision
double native support
float native support
date cast to string
string native support
timestamp cast to string
varchar cast to string
arrays not supported
binary not supported
char not supported
maps not supported
structs not supported
union not supported
Parameters
----------
:param hive_query: (str) hive query to fetch data from table
:param tc: (TkContext) TK context
:return: (Frame) returns frame with hive table data
Examples
--------
Load data into frame from a hive table based on hive query
>>> h_query = "select * from demo_test"
>>> frame = tc.frame.import_hive(h_query)
-etc-
>>> frame.inspect()
[#] number strformat
======================
[0] 1 one
[1] 2 two
[2] 3 three
[3] 4 four
"""
if not isinstance(hive_query, basestring):
raise ValueError("hive query parameter must be a string, but is {0}.".format(type(hive_query)))
TkContext.validate(tc)
scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importHive(tc.jutils.get_scala_sc(), hive_query)
from sparktk.frame.frame import Frame
return Frame(tc, scala_frame)
def import_jdbc(
connection_url, table_name, tc=<class 'sparktk.arguments.implicit'>)
Import data from jdbc table into frame.
connection_url | (str): | JDBC connection url to database server |
table_name | (str): | JDBC table name |
Returns | (Frame): | returns frame with jdbc table data |
Load a frame from a jdbc table specifying the connection url to the database server.
>>> url = "jdbc:postgresql://localhost/postgres"
>>> tb_name = "demo_test"
>>> frame = tc.frame.import_jdbc(url, tb_name)
-etc-
>>> frame.inspect()
[#] a b c d
==================
[0] 1 0.2 -2 5
[1] 2 0.4 -1 6
[2] 3 0.6 0 7
[3] 4 0.8 1 8
>>> frame.schema
[(u'a', int), (u'b', float), (u'c', int), (u'd', int)]
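The connection url follows standard JDBC conventions, so driver options such as credentials can be embedded in it. A sketch for PostgreSQL, where the host, database, user, and password are placeholders:
>>> url = "jdbc:postgresql://localhost/postgres?user=test_user&password=test_pass"
>>> frame = tc.frame.import_jdbc(url, "demo_test")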
def import_jdbc(connection_url, table_name, tc=TkContext.implicit):
"""
Import data from jdbc table into frame.
Parameters
----------
:param connection_url: (str) JDBC connection url to database server
:param table_name: (str) JDBC table name
:return: (Frame) returns frame with jdbc table data
Examples
--------
Load a frame from a jdbc table specifying the connection url to the database server.
>>> url = "jdbc:postgresql://localhost/postgres"
>>> tb_name = "demo_test"
>>> frame = tc.frame.import_jdbc(url, tb_name)
-etc-
>>> frame.inspect()
[#] a b c d
==================
[0] 1 0.2 -2 5
[1] 2 0.4 -1 6
[2] 3 0.6 0 7
[3] 4 0.8 1 8
>>> frame.schema
[(u'a', int), (u'b', float), (u'c', int), (u'd', int)]
"""
if not isinstance(connection_url, basestring):
raise ValueError("connection url parameter must be a string, but is {0}.".format(type(connection_url)))
if not isinstance(table_name, basestring):
raise ValueError("table name parameter must be a string, but is {0}.".format(type(table_name)))
TkContext.validate(tc)
scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importJdbc(tc.jutils.get_scala_sc(), connection_url, table_name)
from sparktk.frame.frame import Frame
return Frame(tc, scala_frame)
def import_pandas(
pandas_frame, schema=None, row_index=True, validate_schema=False, tc=<class 'sparktk.arguments.implicit'>)
Imports data from the specified pandas data frame.
pandas_frame | (pandas.DataFrame): | pandas dataframe object |
schema | (Optional(list[tuples(string, type)])): | Schema description of the fields for a given line. It is a list of tuples which describe each field, (field name, field type), where the field name is a string and the field type is a supported type. If no schema is provided, the schema will be inferred based on the column names and types from the pandas_frame. |
row_index | (Optional(bool)): | Indicates if the row_index is present in the pandas dataframe and needs to be ignored when looking at the data values. Default value is True. |
validate_schema | (Optional(bool)): | If true, validates the data against the schema and attempts to cast the data to the specified type, if it does not match the schema. Defaults to False. |
Returns | (Frame): | spark-tk frame that contains data from the pandas_frame |
Create a pandas data frame:
>>> import pandas
>>> ratings_data = [[0, "invalid"], [1, "Very Poor"], [2, "Poor"], [3, "Average"], [4, "Good"], [5, "Very Good"]]
>>> df = pandas.DataFrame(ratings_data, columns=['rating_id', 'rating_text'])
>>> df
rating_id rating_text
0 0 invalid
1 1 Very Poor
2 2 Poor
3 3 Average
4 4 Good
5 5 Very Good
>>> df.columns.tolist()
['rating_id', 'rating_text']
>>> df.dtypes
rating_id int64
rating_text object
dtype: object
When using import_pandas by just passing the pandas data frame, it will use the column names and types from the pandas data frame to generate the schema.
>>> frame = tc.frame.import_pandas(df)
>>> frame.inspect()
[#] rating_id rating_text
===========================
[0] 0 invalid
[1] 1 Very Poor
[2] 2 Poor
[3] 3 Average
[4] 4 Good
[5] 5 Very Good
>>> frame.schema
[('rating_id', <type 'long'>), ('rating_text', <type 'str'>)]
Alternatively, you can specify a schema when importing the pandas data frame. There is also the option to validate the data against the schema. If this option is enabled, we will attempt to cast the data to the column's data type, if it does not match the schema.
For example, here we will specify a schema where the rating_id column will instead be called 'rating_float' and its data type will be a float. We will also enable the validate_schema option so that the rating_id value will be cast to a float:
>>> schema = [("rating_float", float), ("rating_str", unicode)]
>>> frame = tc.frame.import_pandas(df, schema, validate_schema=True)
>>> frame.inspect()
[#] rating_float rating_str
=============================
[0] 0.0 invalid
[1] 1.0 Very Poor
[2] 2.0 Poor
[3] 3.0 Average
[4] 4.0 Good
[5] 5.0 Very Good
>>> frame.schema
[('rating_float', <type 'float'>), ('rating_str', <type 'unicode'>)]
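Pandas datetime64 columns are also handled: on import they are converted to milliseconds since epoch (see pandas_datetime_to_ms in the source below). A minimal sketch, assuming the sparktk datetime type is used in the schema:
>>> import pandas
>>> from sparktk import dtypes
>>> df = pandas.DataFrame({"when": pandas.to_datetime(["2016-01-01", "2016-06-15"])})
>>> frame = tc.frame.import_pandas(df, schema=[("when", dtypes.datetime)])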
def import_pandas(pandas_frame, schema=None, row_index=True, validate_schema=False, tc=TkContext.implicit):
"""
Imports data from the specified pandas data frame.
Parameters
----------
:param pandas_frame: (pandas.DataFrame) pandas dataframe object
:param schema: (Optional(list[tuples(string, type)])) Schema description of the fields for a given line. It is a
list of tuples which describe each field, (field name, field type), where the field name is a
string and the field type is a supported type. If no schema is provided, the schema will be inferred based
on the column names and types from the pandas_frame.
:param row_index: (Optional(bool)) Indicates if the row_index is present in the pandas dataframe and needs to be
ignored when looking at the data values. Default value is True.
:param validate_schema: (Optional(bool)) If true, validates the data against the schema and attempts to cast the
data to the specified type, if it does not match the schema. Defaults to False.
:return: (Frame) spark-tk frame that contains data from the pandas_frame
Examples
--------
Create a pandas data frame:
>>> import pandas
>>> ratings_data = [[0, "invalid"], [1, "Very Poor"], [2, "Poor"], [3, "Average"], [4, "Good"], [5, "Very Good"]]
>>> df = pandas.DataFrame(ratings_data, columns=['rating_id', 'rating_text'])
>>> df
rating_id rating_text
0 0 invalid
1 1 Very Poor
2 2 Poor
3 3 Average
4 4 Good
5 5 Very Good
>>> df.columns.tolist()
['rating_id', 'rating_text']
>>> df.dtypes
rating_id int64
rating_text object
dtype: object
When using import_pandas by just passing the pandas data frame, it will use the column names and types from the
pandas data frame to generate the schema.
>>> frame = tc.frame.import_pandas(df)
>>> frame.inspect()
[#] rating_id rating_text
===========================
[0] 0 invalid
[1] 1 Very Poor
[2] 2 Poor
[3] 3 Average
[4] 4 Good
[5] 5 Very Good
>>> frame.schema
[('rating_id', <type 'long'>), ('rating_text', <type 'str'>)]
Alternatively, you can specify a schema when importing the pandas data frame. There is also the option to validate
the data against the schema. If this option is enabled, we will attempt to cast the data to the column's data type,
if it does not match the schema.
For example, here we will specify a schema where the rating_id column will instead be called 'rating_float' and its
data type will be a float. We will also enable the validate_schema option so that the rating_id value will be
cast to a float:
>>> schema = [("rating_float", float), ("rating_str", unicode)]
>>> frame = tc.frame.import_pandas(df, schema, validate_schema=True)
>>> frame.inspect()
[#] rating_float rating_str
=============================
[0] 0.0 invalid
[1] 1.0 Very Poor
[2] 2.0 Poor
[3] 3.0 Average
[4] 4.0 Good
[5] 5.0 Very Good
>>> frame.schema
[('rating_float', <type 'float'>), ('rating_str', <type 'unicode'>)]
"""
try:
import pandas
except:
raise RuntimeError("pandas module not found, unable to download. Install pandas or try the take command.")
if not isinstance(pandas_frame, pandas.DataFrame):
raise TypeError("data_frame must be a pandas DataFrame.")
TkContext.validate(tc)
if schema is not None:
schema = _validate(schema)
else:
schema = _get_schema_from_df(pandas_frame)
if not row_index:
pandas_frame = pandas_frame.reset_index()
pandas_frame = pandas_frame.dropna(thresh=len(pandas_frame.columns))
field_names = [x[0] for x in schema]
if len(pandas_frame.columns) != len(field_names):
raise ValueError("Number of columns in Pandasframe {0} does not match the number of columns in the"
" schema provided {1}.".format(len(pandas_frame.columns), len(field_names)))
date_time_columns = [i for i, x in enumerate(pandas_frame.dtypes) if x == "datetime64[ns]"]
has_date_time = len(date_time_columns) > 0
# pandas gives us the date/time in ns (nanoseconds since epoch) or as a Timestamp, and spark-tk expects it as ms, so we need to do the conversion
def pandas_datetime_to_ms(row):
for i in date_time_columns:
if isinstance(row[i], long):
row[i] = row[i] / 1000000
elif isinstance(row[i], pandas.tslib.Timestamp) or isinstance(row[i], datetime):
dt = row[i]
# get the number of seconds since epoch (%s), multiply by 1000 for ms, then add the
# microseconds (integer-divided by 1000) to get ms precision.
row[i] = long((long(dt.strftime("%s")) * 1000) + (dt.microsecond // 1000))
return row
pandas_rows = pandas_frame[0:len(pandas_frame.index)].values.tolist()
# if the dataframe has date/time columns, map them to ms
if (has_date_time):
pandas_rows = map(pandas_datetime_to_ms, pandas_rows)
# create frame with the pandas_rows
frame = tc.frame.create(pandas_rows, schema)
if validate_schema:
frame = tc.frame.create(frame.rdd, schema, validate_schema)
return frame
def load(
path, tc=<class 'sparktk.arguments.implicit'>)
load Frame from given path
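A frame that was previously saved with frame.save can be restored with load (equivalently, tc.load). A minimal sketch, using a hypothetical path:
>>> frame.save("sandbox/my_frame")
>>> restored = tc.load("sandbox/my_frame")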
def load(path, tc=TkContext.implicit):
"""load Frame from given path"""
TkContext.validate(tc)
return tc.load(path, Frame)
Classes
class Frame
class Frame(object):
def __init__(self, tc, source, schema=None, validate_schema=False):
"""(Private constructor -- use tc.frame.create or other methods available from the TkContext)"""
self._tc = tc
if self._is_scala_frame(source):
self._frame = source
elif self._is_scala_rdd(source):
scala_schema = schema_to_scala(tc.sc, schema)
self._frame = self._create_scala_frame(tc.sc, source, scala_schema)
elif self._is_scala_dataframe(source):
self._frame = self._create_scala_frame_from_scala_dataframe(tc.sc, source)
elif isinstance(source, DataFrame):
self._frame = self._create_scala_frame_from_scala_dataframe(tc.sc, source._jdf)
elif isinstance(source, PythonFrame):
self._frame = source
else:
if not isinstance(source, RDD):
if not isinstance(source, list) or (len(source) > 0 and any(not isinstance(row, (list, tuple)) for row in source)):
raise TypeError("Invalid data source. The data parameter must be a 2-dimensional list (list of row data) or an RDD.")
inferred_schema = False
if isinstance(schema, list):
if all(isinstance(item, basestring) for item in schema):
# check if schema is just a list of column names (versus string and data type tuples)
schema = self._infer_schema(source, schema)
inferred_schema = True
elif not all(isinstance(item, tuple) and
len(item) == 2 and
isinstance(item[0], basestring) for item in schema):
raise TypeError("Invalid schema. Expected a list of tuples (str, type) with the column name and data type, but received type %s." % type(schema))
# check for duplicate column names
column_names = [col[0] for col in schema]
duplicate_column_names = set([col for col in column_names if column_names.count(col) > 1])
if len(duplicate_column_names) > 0:
raise ValueError("Invalid schema, column names cannot be duplicated: %s" % ", ".join(duplicate_column_names))
elif schema is None:
schema = self._infer_schema(source)
inferred_schema = True
else:
# Schema is not a list or None
raise TypeError("Invalid schema type: %s. Expected a list of tuples (str, type) with the column name and data type." % type(schema))
for item in schema:
if not self._is_supported_datatype(item[1]):
if inferred_schema:
raise TypeError("The %s data type was found when inferring the schema, and it is not a "
"supported data type. Instead, specify a schema that uses a supported data "
"type, and enable validate_schema so that the data is converted to the proper "
"data type.\n\nInferred schema: %s\n\nSupported data types: %s" %
(str(item[1]), str(schema), dtypes.dtypes))
else:
raise TypeError("Invalid schema. %s is not a supported data type.\n\nSupported data types: %s" %
(str(item[1]), dtypes.dtypes))
source = tc.sc.parallelize(source)
if schema and validate_schema:
# Validate schema by going through the data and checking the data type and attempting to parse it
validate_schema_result = self.validate_pyrdd_schema(source, schema)
source = validate_schema_result.validated_rdd
logger.debug("%s values were unable to be parsed to the schema's data type." % validate_schema_result.bad_value_count)
# If schema contains matrix datatype, then apply type_coercer to convert list[list] to numpy ndarray
map_source = MatrixCoercion.schema_is_coercible(source, list(schema))
self._frame = PythonFrame(map_source, schema)
def _merge_types(self, type_list_a, type_list_b):
"""
Merges two lists of data types
:param type_list_a: First list of data types to merge
:param type_list_b: Second list of data types to merge
:return: List of merged data types
"""
if not isinstance(type_list_a, list) or not isinstance(type_list_b, list):
raise TypeError("Unable to generate schema, because schema is not a list.")
if len(type_list_a) != len(type_list_b):
raise ValueError("Length of each row must be the same (found rows with lengths: %s and %s)." % (len(type_list_a), len(type_list_b)))
return [dtypes._DataTypes.merge_types(type_list_a[i], type_list_b[i]) for i in xrange(0, len(type_list_a))]
def _infer_types_for_row(self, row):
"""
Returns a list of data types for the data in the specified row
:param row: List or Row of data
:return: List of data types
"""
inferred_types = []
for item in row:
if item is None:
inferred_types.append(int)
elif not isinstance(item, list):
inferred_types.append(type(item))
else:
inferred_types.append(dtypes.vector((len(item))))
return inferred_types
def _infer_schema(self, data, column_names=[], sample_size=100):
"""
Infers the schema based on the data in the RDD.
:param sc: Spark Context
:param data: Data used to infer schema
:param column_names: Optional column names to use in the schema. If no column names are provided, columns
are given numbered names. If there are more columns in the RDD than there are in the
column_names list, remaining columns will be numbered.
:param sample_size: Number of rows to check when inferring the schema. Defaults to 100.
:return: Schema
"""
inferred_schema = []
if isinstance(data, list):
if len(data) > 0:
# get the schema for the first row
data_types = self._infer_types_for_row(data[0])
sample_size = min(sample_size, len(data))
for i in xrange (1, sample_size):
data_types = self._merge_types(data_types, self._infer_types_for_row(data[i]))
for i, data_type in enumerate(data_types):
column_name = "C%s" % i
if len(column_names) > i:
column_name = column_names[i]
inferred_schema.append((column_name, data_type))
else:
raise TypeError("Unable to infer schema, because the data provided is not a list.")
return inferred_schema
def _is_supported_datatype(self, data_type):
"""
Returns True if the specified data_type is supported.
"""
supported_primitives = [int, float, long, str, unicode]
if data_type in supported_primitives:
return True
elif data_type is dtypes.datetime:
return True
elif type(data_type) is dtypes.vector:
return True
elif data_type is dtypes.matrix:
return True
else:
return False
def validate_pyrdd_schema(self, pyrdd, schema):
if isinstance(pyrdd, RDD):
schema_length = len(schema)
num_bad_values = self._tc.sc.accumulator(0)
def validate_schema(row, accumulator):
data = []
if len(row) != schema_length:
raise ValueError("Length of the row (%s) does not match the schema length (%s)." % (len(row), len(schema)))
for index, column in enumerate(schema):
data_type = column[1]
try:
if row[index] is not None:
data.append(dtypes.dtypes.cast(row[index], data_type))
except:
data.append(None)
accumulator += 1
return data
validated_rdd = pyrdd.map(lambda row: validate_schema(row, num_bad_values))
# Force rdd to load, so that we can get a bad value count
validated_rdd.count()
return SchemaValidationReturn(validated_rdd, num_bad_values.value)
else:
raise TypeError("Unable to validate schema, because the pyrdd provided is not an RDD.")
@staticmethod
def _create_scala_frame(sc, scala_rdd, scala_schema):
"""call constructor in JVM"""
return sc._jvm.org.trustedanalytics.sparktk.frame.Frame(scala_rdd, scala_schema, False)
@staticmethod
def _create_scala_frame_from_scala_dataframe(sc, scala_dataframe):
"""call constructor in JVM"""
return sc._jvm.org.trustedanalytics.sparktk.frame.Frame(scala_dataframe)
@staticmethod
def _from_scala(tc, scala_frame):
"""creates a python Frame for the given scala Frame"""
return Frame(tc, scala_frame)
def _frame_to_scala(self, python_frame):
"""converts a PythonFrame to a Scala Frame"""
scala_schema = schema_to_scala(self._tc.sc, python_frame.schema)
scala_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.rdd.PythonJavaRdd.pythonToScala(python_frame.rdd._jrdd, scala_schema)
return self._create_scala_frame(self._tc.sc, scala_rdd, scala_schema)
def _is_scala_frame(self, item):
return self._tc._jutils.is_jvm_instance_of(item, self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.Frame)
def _is_scala_rdd(self, item):
return self._tc._jutils.is_jvm_instance_of(item, self._tc.sc._jvm.org.apache.spark.rdd.RDD)
def _is_scala_dataframe(self, item):
return self._tc._jutils.is_jvm_instance_of(item, self._tc.sc._jvm.org.apache.spark.sql.DataFrame)
def _is_python_rdd(self, item):
return isinstance(item, RDD)
@property
def _is_scala(self):
"""answers whether the current frame is backed by a Scala Frame"""
return self._is_scala_frame(self._frame)
@property
def _is_python(self):
"""answers whether the current frame is backed by a _PythonFrame"""
return not self._is_scala
@property
def _scala(self):
"""gets frame backend as Scala Frame, causes conversion if it is current not"""
if self._is_python:
# If schema contains matrix dataype,
# then apply type_coercer_pymlib to convert ndarray to pymlib DenseMatrix for serialization purpose at java
self._frame.rdd = MatrixCoercion.schema_is_coercible(self._frame.rdd, list(self._frame.schema), True)
# convert PythonFrame to a Scala Frame
scala_schema = schema_to_scala(self._tc.sc, self._frame.schema)
scala_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.pythonToScala(self._frame.rdd._jrdd, scala_schema)
self._frame = self._create_scala_frame(self._tc.sc, scala_rdd, scala_schema)
return self._frame
@property
def _python(self):
"""gets frame backend as _PythonFrame, causes conversion if it is current not"""
if self._is_scala:
# convert Scala Frame to a PythonFrame
scala_schema = self._frame.schema()
java_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(self._frame.rdd())
python_schema = schema_to_python(self._tc.sc, scala_schema)
python_rdd = RDD(java_rdd, self._tc.sc)
# If schema contains matrix datatype, then apply type_coercer to convert list[list] to numpy ndarray
map_python_rdd = MatrixCoercion.schema_is_coercible(python_rdd, list(python_schema))
self._frame = PythonFrame(map_python_rdd, python_schema)
return self._frame
##########################################################################
# API
##########################################################################
@property
def rdd(self):
"""pyspark RDD (causes conversion if currently backed by a Scala RDD)"""
return self._python.rdd
@property
def dataframe(self):
"""pyspark DataFrame (causes conversion through Scala)"""
return DataFrame(self._scala.dataframe(), self._tc.sql_context)
@property
def schema(self):
if self._is_scala:
return schema_to_python(self._tc.sc, self._frame.schema()) # need ()'s on schema because it's a def in scala
return self._frame.schema
@property
def column_names(self):
"""
Column identifications in the current frame.
:return: list of names of all the frame's columns
Returns the names of the columns of the current frame.
Examples
--------
>>> frame.column_names
[u'name', u'age', u'tenure', u'phone']
"""
return [name for name, data_type in self.schema]
# Frame Operations
from sparktk.frame.ops.add_columns import add_columns
from sparktk.frame.ops.append import append
from sparktk.frame.ops.assign_sample import assign_sample
from sparktk.frame.ops.bin_column import bin_column
from sparktk.frame.ops.binary_classification_metrics import binary_classification_metrics
from sparktk.frame.ops.box_cox import box_cox
from sparktk.frame.ops.categorical_summary import categorical_summary
from sparktk.frame.ops.collect import collect
from sparktk.frame.ops.column_median import column_median
from sparktk.frame.ops.column_mode import column_mode
from sparktk.frame.ops.column_summary_statistics import column_summary_statistics
from sparktk.frame.ops.copy import copy
from sparktk.frame.ops.correlation import correlation
from sparktk.frame.ops.correlation_matrix import correlation_matrix
from sparktk.frame.ops.count import count
from sparktk.frame.ops.covariance import covariance
from sparktk.frame.ops.covariance_matrix import covariance_matrix
from sparktk.frame.ops.cumulative_percent import cumulative_percent
from sparktk.frame.ops.cumulative_sum import cumulative_sum
from sparktk.frame.ops.dot_product import dot_product
from sparktk.frame.ops.drop_columns import drop_columns
from sparktk.frame.ops.drop_duplicates import drop_duplicates
from sparktk.frame.ops.drop_rows import drop_rows
from sparktk.frame.ops.ecdf import ecdf
from sparktk.frame.ops.entropy import entropy
from sparktk.frame.ops.export_data import export_to_csv, export_to_jdbc, export_to_json, export_to_hbase, export_to_hive
from sparktk.frame.ops.filter import filter
from sparktk.frame.ops.flatten_columns import flatten_columns
from sparktk.frame.ops.group_by import group_by
from sparktk.frame.ops.histogram import histogram
from sparktk.frame.ops.inspect import inspect
from sparktk.frame.ops.join_inner import join_inner
from sparktk.frame.ops.join_left import join_left
from sparktk.frame.ops.join_right import join_right
from sparktk.frame.ops.join_outer import join_outer
from sparktk.frame.ops.map_columns import map_columns
from sparktk.frame.ops.matrix_covariance_matrix import matrix_covariance_matrix
from sparktk.frame.ops.matrix_pca import matrix_pca
from sparktk.frame.ops.matrix_svd import matrix_svd
from sparktk.frame.ops.multiclass_classification_metrics import multiclass_classification_metrics
from sparktk.frame.ops.power_iteration_clustering import power_iteration_clustering
from sparktk.frame.ops.quantile_bin_column import quantile_bin_column
from sparktk.frame.ops.quantiles import quantiles
from sparktk.frame.ops.rename_columns import rename_columns
from sparktk.frame.ops.reverse_box_cox import reverse_box_cox
from sparktk.frame.ops.save import save
from sparktk.frame.ops.sort import sort
from sparktk.frame.ops.sortedk import sorted_k
from sparktk.frame.ops.take import take
from sparktk.frame.ops.tally import tally
from sparktk.frame.ops.tally_percent import tally_percent
from sparktk.frame.ops.timeseries_augmented_dickey_fuller_test import timeseries_augmented_dickey_fuller_test
from sparktk.frame.ops.timeseries_breusch_godfrey_test import timeseries_breusch_godfrey_test
from sparktk.frame.ops.timeseries_breusch_pagan_test import timeseries_breusch_pagan_test
from sparktk.frame.ops.timeseries_durbin_watson_test import timeseries_durbin_watson_test
from sparktk.frame.ops.timeseries_from_observations import timeseries_from_observations
from sparktk.frame.ops.timeseries_slice import timeseries_slice
from sparktk.frame.ops.to_pandas import to_pandas
from sparktk.frame.ops.topk import top_k
from sparktk.frame.ops.unflatten_columns import unflatten_columns
Ancestors (in MRO)
- Frame
- __builtin__.object
Instance variables
var column_names
Column identifications in the current frame.
:return: list of names of all the frame's columns
Returns the names of the columns of the current frame.
>>> frame.column_names
[u'name', u'age', u'tenure', u'phone']
var dataframe
pyspark DataFrame (causes conversion through Scala)
var rdd
pyspark RDD (causes conversion if currently backed by a Scala RDD)
var schema
Methods
def __init__(
self, tc, source, schema=None, validate_schema=False)
(Private constructor -- use tc.frame.create or other methods available from the TkContext)
def __init__(self, tc, source, schema=None, validate_schema=False):
"""(Private constructor -- use tc.frame.create or other methods available from the TkContext)"""
self._tc = tc
if self._is_scala_frame(source):
self._frame = source
elif self._is_scala_rdd(source):
scala_schema = schema_to_scala(tc.sc, schema)
self._frame = self._create_scala_frame(tc.sc, source, scala_schema)
elif self._is_scala_dataframe(source):
self._frame = self._create_scala_frame_from_scala_dataframe(tc.sc, source)
elif isinstance(source, DataFrame):
self._frame = self._create_scala_frame_from_scala_dataframe(tc.sc, source._jdf)
elif isinstance(source, PythonFrame):
self._frame = source
else:
if not isinstance(source, RDD):
if not isinstance(source, list) or (len(source) > 0 and any(not isinstance(row, (list, tuple)) for row in source)):
raise TypeError("Invalid data source. The data parameter must be a 2-dimensional list (list of row data) or an RDD.")
inferred_schema = False
if isinstance(schema, list):
if all(isinstance(item, basestring) for item in schema):
# check if schema is just a list of column names (versus string and data type tuples)
schema = self._infer_schema(source, schema)
inferred_schema = True
elif not all(isinstance(item, tuple) and
len(item) == 2 and
isinstance(item[0], basestring) for item in schema):
raise TypeError("Invalid schema. Expected a list of tuples (str, type) with the column name and data type, but received type %s." % type(schema))
# check for duplicate column names
column_names = [col[0] for col in schema]
duplicate_column_names = set([col for col in column_names if column_names.count(col) > 1])
if len(duplicate_column_names) > 0:
raise ValueError("Invalid schema, column names cannot be duplicated: %s" % ", ".join(duplicate_column_names))
elif schema is None:
schema = self._infer_schema(source)
inferred_schema = True
else:
# Schema is not a list or None
raise TypeError("Invalid schema type: %s. Expected a list of tuples (str, type) with the column name and data type." % type(schema))
for item in schema:
if not self._is_supported_datatype(item[1]):
if inferred_schema:
raise TypeError("The %s data type was found when inferring the schema, and it is not a "
"supported data type. Instead, specify a schema that uses a supported data "
"type, and enable validate_schema so that the data is converted to the proper "
"data type.\n\nInferred schema: %s\n\nSupported data types: %s" %
(str(item[1]), str(schema), dtypes.dtypes))
else:
raise TypeError("Invalid schema. %s is not a supported data type.\n\nSupported data types: %s" %
(str(item[1]), dtypes.dtypes))
source = tc.sc.parallelize(source)
if schema and validate_schema:
# Validate schema by going through the data and checking the data type and attempting to parse it
validate_schema_result = self.validate_pyrdd_schema(source, schema)
source = validate_schema_result.validated_rdd
logger.debug("%s values were unable to be parsed to the schema's data type." % validate_schema_result.bad_value_count)
# If schema contains matrix datatype, then apply type_coercer to convert list[list] to numpy ndarray
map_source = MatrixCoercion.schema_is_coercible(source, list(schema))
self._frame = PythonFrame(map_source, schema)
def add_columns(
self, func, schema)
Add columns to current frame.
Assigns data to column based on evaluating a function for each row.
- The row |UDF| ('func') must return a value in the same format as specified by the schema.
:param func: (UDF) Function which takes the values in the row and produces a value, or collection of values, for the new cell(s).
:param schema: (List[(str,type)]) Schema for the column(s) being added.
Given our frame, let's add a column which has how many years the person has been over 18
>>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
... ['Susan',33,3,'555-0202'],
... ['Thurston',65,26,'555-4510'],
... ['Judy',44,14,'555-2183']],
... schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])
>>> frame.inspect()
[#] name age tenure phone
====================================
[0] Fred 39 16 555-1234
[1] Susan 33 3 555-0202
[2] Thurston 65 26 555-4510
[3] Judy 44 14 555-2183
>>> frame.add_columns(lambda row: row.age - 18, ('adult_years', int))
>>> frame.inspect()
[#] name age tenure phone adult_years
=================================================
[0] Fred 39 16 555-1234 21
[1] Susan 33 3 555-0202 15
[2] Thurston 65 26 555-4510 47
[3] Judy 44 14 555-2183 26
Multiple columns can be added at the same time. Let's add percentage of life and percentage of adult life in one call, which is more efficient.
>>> frame.add_columns(lambda row: [row.tenure / float(row.age), row.tenure / float(row.adult_years)],
... [("of_age", float), ("of_adult", float)])
>>> frame.inspect(round=2)
[#] name age tenure phone adult_years of_age of_adult
===================================================================
[0] Fred 39 16 555-1234 21 0.41 0.76
[1] Susan 33 3 555-0202 15 0.09 0.20
[2] Thurston 65 26 555-4510 47 0.40 0.55
[3] Judy 44 14 555-2183 26 0.32 0.54
Note that the function returns a list, and therefore the schema also needs to be a list.
It is not necessary to use lambda syntax, any function will do, as long as it takes a single row argument. We can also call other local functions within.
Let's add a column which shows a portion of the person's name based on their adult tenure percentage.
>>> def percentage_of_string(string, percentage):
... '''returns a substring of the given string according to the given percentage'''
... substring_len = int(percentage * len(string))
... return string[:substring_len]
>>> def add_name_by_adult_tenure(row):
... return percentage_of_string(row.name, row.of_adult)
>>> frame.add_columns(add_name_by_adult_tenure, ('tenured_name', unicode))
>>> frame.inspect(columns=['name', 'of_adult', 'tenured_name'], round=2)
[#] name of_adult tenured_name
=====================================
[0] Fred 0.76 Fre
[1] Susan 0.20 S
[2] Thurston 0.55 Thur
[3] Judy 0.54 Ju
Let's add a name based on tenure percentage of age.
>>> frame.add_columns(lambda row: percentage_of_string(row.name, row.of_age),
... ('tenured_name_age', unicode))
>>> frame.inspect(round=2)
[#] name age tenure phone adult_years of_age of_adult
===================================================================
[0] Fred 39 16 555-1234 21 0.41 0.76
[1] Susan 33 3 555-0202 15 0.09 0.20
[2] Thurston 65 26 555-4510 47 0.40 0.55
[3] Judy 44 14 555-2183 26 0.32 0.54
<BLANKLINE>
[#] tenured_name tenured_name_age
===================================
[0] Fre F
[1] S
[2] Thur Thu
[3] Ju J
def add_columns(self, func, schema):
"""
Add columns to current frame.
Assigns data to column based on evaluating a function for each row.
Notes
-----
1. The row |UDF| ('func') must return a value in the same format as
specified by the schema.
Parameters
----------
:param func: (UDF) Function which takes the values in the row and produces a value, or collection of values, for the new cell(s).
:param schema: (List[(str,type)]) Schema for the column(s) being added.
Examples
--------
Given our frame, let's add a column which has how many years the person has been over 18
>>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
... ['Susan',33,3,'555-0202'],
... ['Thurston',65,26,'555-4510'],
... ['Judy',44,14,'555-2183']],
... schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])
>>> frame.inspect()
[#] name age tenure phone
====================================
[0] Fred 39 16 555-1234
[1] Susan 33 3 555-0202
[2] Thurston 65 26 555-4510
[3] Judy 44 14 555-2183
>>> frame.add_columns(lambda row: row.age - 18, ('adult_years', int))
>>> frame.inspect()
[#] name age tenure phone adult_years
=================================================
[0] Fred 39 16 555-1234 21
[1] Susan 33 3 555-0202 15
[2] Thurston 65 26 555-4510 47
[3] Judy 44 14 555-2183 26
Multiple columns can be added at the same time. Let's add percentage of
life and percentage of adult life in one call, which is more efficient.
>>> frame.add_columns(lambda row: [row.tenure / float(row.age), row.tenure / float(row.adult_years)],
... [("of_age", float), ("of_adult", float)])
>>> frame.inspect(round=2)
[#] name age tenure phone adult_years of_age of_adult
===================================================================
[0] Fred 39 16 555-1234 21 0.41 0.76
[1] Susan 33 3 555-0202 15 0.09 0.20
[2] Thurston 65 26 555-4510 47 0.40 0.55
[3] Judy 44 14 555-2183 26 0.32 0.54
Note that the function returns a list, and therefore the schema also needs to be a list.
It is not necessary to use lambda syntax, any function will do, as long as it takes a single row argument. We
can also call other local functions within.
Let's add a column which shows a portion of the person's name based on their adult tenure percentage.
>>> def percentage_of_string(string, percentage):
... '''returns a substring of the given string according to the given percentage'''
... substring_len = int(percentage * len(string))
... return string[:substring_len]
>>> def add_name_by_adult_tenure(row):
... return percentage_of_string(row.name, row.of_adult)
>>> frame.add_columns(add_name_by_adult_tenure, ('tenured_name', unicode))
>>> frame.inspect(columns=['name', 'of_adult', 'tenured_name'], round=2)
[#] name of_adult tenured_name
=====================================
[0] Fred 0.76 Fre
[1] Susan 0.20 S
[2] Thurston 0.55 Thur
[3] Judy 0.54 Ju
Let's add a name based on tenure percentage of age.
>>> frame.add_columns(lambda row: percentage_of_string(row.name, row.of_age),
... ('tenured_name_age', unicode))
>>> frame.inspect(round=2)
[#] name age tenure phone adult_years of_age of_adult
===================================================================
[0] Fred 39 16 555-1234 21 0.41 0.76
[1] Susan 33 3 555-0202 15 0.09 0.20
[2] Thurston 65 26 555-4510 47 0.40 0.55
[3] Judy 44 14 555-2183 26 0.32 0.54
<BLANKLINE>
[#] tenured_name tenured_name_age
===================================
[0] Fre F
[1] S
[2] Thur Thu
[3] Ju J
"""
schema_helper.validate(schema)
schema_helper.validate_is_mergeable(self._tc, self.schema, schema)
row = Row(self.schema)
def add_columns_func(r):
row._set_data(r)
return func(row)
if isinstance(schema, list):
self._python.rdd = self._python.rdd.map(lambda r: r + add_columns_func(r))
self._python.schema.extend(schema)
else:
self._python.rdd = self._python.rdd.map(lambda r: r + [add_columns_func(r)])
self._python.schema.append(schema)
def append(
self, frame)
Adds more data to the current frame.
:param frame: (Frame) Frame of data to append to the current frame.
In this example, we start off by creating a frame of animals.
>>> animals = tc.frame.create([['dog', 'snoopy'],['cat', 'tom'],['bear', 'yogi'],['mouse', 'jerry']],
... [('animal', str), ('name', str)])
[===Job Progress===]
>>> animals.inspect()
[#] animal name
===================
[0] dog snoopy
[1] cat tom
[2] bear yogi
[3] mouse jerry
Then, we append a frame that will add a few more animals to the original frame.
>>> animals.append(tc.frame.create([['donkey'],['elephant'], ['ostrich']], [('animal', str)]))
[===Job Progress===]
>>> animals.inspect()
[#] animal name
=====================
[0] dog snoopy
[1] cat tom
[2] bear yogi
[3] mouse jerry
[4] donkey None
[5] elephant None
[6] ostrich None
The data we added didn't have names, so None values were inserted for the new rows.
def append(self, frame):
"""
Adds more data to the current frame.
Parameters
----------
:param frame: (Frame) Frame of data to append to the current frame.
Examples
--------
In this example, we start off by creating a frame of animals.
>>> animals = tc.frame.create([['dog', 'snoopy'],['cat', 'tom'],['bear', 'yogi'],['mouse', 'jerry']],
... [('animal', str), ('name', str)])
[===Job Progress===]
>>> animals.inspect()
[#] animal name
===================
[0] dog snoopy
[1] cat tom
[2] bear yogi
[3] mouse jerry
Then, we append a frame that will add a few more animals to the original frame.
>>> animals.append(tc.frame.create([['donkey'],['elephant'], ['ostrich']], [('animal', str)]))
[===Job Progress===]
>>> animals.inspect()
[#] animal name
=====================
[0] dog snoopy
[1] cat tom
[2] bear yogi
[3] mouse jerry
[4] donkey None
[5] elephant None
[6] ostrich None
The data we added didn't have names, so None values were inserted for the new rows.
"""
from sparktk.frame.frame import Frame
if not isinstance(frame, Frame):
raise TypeError("frame must be a Frame type, but is: {0}".format(type(frame)))
self._scala.append(frame._scala)
def assign_sample(
self, sample_percentages, sample_labels=None, output_column=None, seed=None)
Randomly group rows into user-defined classes.
:param sample_percentages: (List[float]) Entries are non-negative and sum to 1. (See the note below.) If the *i*'th entry of the list is *p*, then each row receives label *i* with independent probability *p*.
:param sample_labels: (Optional[List[str]]) Names to be used for the split classes. Defaults to 'TR', 'TE', 'VA' when the length of *sample_percentages* is 3, and defaults to Sample_0, Sample_1, ... otherwise.
:param output_column: (str) Name of the new column which holds the labels generated by the function
:param seed: (int) Random seed used to generate the labels. Defaults to 0.
Randomly assign classes to rows given a vector of percentages.
The table receives an additional column that contains a random label.
The random label is generated by a probability distribution function.
The distribution function is specified by the sample_percentages, a list of
floating point values, which add up to 1.
The labels are non-negative integers drawn from the range :math:`[0, len(S) - 1]`, where :math:`S` is the sample_percentages.
The sample percentages provided by the user are preserved to at least eight decimal places, but beyond this there may be small changes due to floating point imprecision.
In particular:
- The engine validates that the sum of probabilities sums to 1.0 within eight decimal places and returns an error if the sum falls outside of this range.
- The probability of the final class is clamped so that each row receives a valid label with probability one.
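Conceptually, each row's label comes from comparing a uniform random draw against the cumulative percentages; a standalone sketch of that lookup (not the sparktk implementation):
>>> def pick_label(draw, percentages, labels):
...     cumulative = 0.0
...     for label, p in zip(labels, percentages):
...         cumulative += p
...         if draw < cumulative:
...             return label
...     return labels[-1]   # final class is clamped so every row gets a label
>>> pick_label(0.55, [0.4, 0.3, 0.3], ['TR', 'TE', 'VA'])
'TE'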
Consider this simple frame.
>>> frame.inspect()
[#] blip id
=============
[0] abc 0
[1] def 1
[2] ghi 2
[3] jkl 3
[4] mno 4
[5] pqr 5
[6] stu 6
[7] vwx 7
[8] yza 8
[9] bcd 9
We'll assign labels to each row according to a rough 40-30-30 split, for "train", "test", and "validate".
>>> frame.assign_sample([0.4, 0.3, 0.3])
[===Job Progress===]
>>> frame.inspect()
[#] blip id sample_bin
=========================
[0] abc 0 VA
[1] def 1 TR
[2] ghi 2 TE
[3] jkl 3 TE
[4] mno 4 TE
[5] pqr 5 TR
[6] stu 6 TR
[7] vwx 7 VA
[8] yza 8 VA
[9] bcd 9 VA
Now the frame has a new column named "sample_bin" with a string label. Values in the other columns are unaffected.
Here it is again, this time specifying labels, output column and random seed
>>> frame.assign_sample([0.2, 0.2, 0.3, 0.3],
... ["cat1", "cat2", "cat3", "cat4"],
... output_column="cat",
... seed=12)
[===Job Progress===]
>>> frame.inspect()
[#] blip id sample_bin cat
===============================
[0] abc 0 VA cat4
[1] def 1 TR cat2
[2] ghi 2 TE cat3
[3] jkl 3 TE cat4
[4] mno 4 TE cat1
[5] pqr 5 TR cat3
[6] stu 6 TR cat2
[7] vwx 7 VA cat3
[8] yza 8 VA cat3
[9] bcd 9 VA cat4
def assign_sample(self, sample_percentages,
sample_labels = None,
output_column = None,
seed = None):
"""
Randomly group rows into user-defined classes.
Parameters
----------
:param sample_percentages: (List[float]) Entries are non-negative and sum to 1. (See the note below.)
If the *i*'th entry of the list is *p*, then each row
receives label *i* with independent probability *p*.
:param sample_labels: (Optional[List[str]]) Names to be used for the split classes. Defaults to 'TR', 'TE',
'VA' when the length of *sample_percentages* is 3, and defaults
to Sample_0, Sample_1, ... otherwise.
:param output_column: (str) Name of the new column which holds the labels generated by the function
:param seed: (int) Random seed used to generate the labels. Defaults to 0.
Randomly assign classes to rows given a vector of percentages.
The table receives an additional column that contains a random label.
The random label is generated by a probability distribution function.
The distribution function is specified by the sample_percentages, a list of
floating point values, which add up to 1.
The labels are non-negative integers drawn from the range
:math:`[ 0, len(S) - 1]` where :math:`S` is the sample_percentages.
Notes
-----
The sample percentages provided by the user are preserved to at least eight
decimal places, but beyond this there may be small changes due to floating
point imprecision.
In particular:
1. The engine validates that the sum of probabilities sums to 1.0 within
eight decimal places and returns an error if the sum falls outside of this
range.
+ The probability of the final class is clamped so that each row receives a
valid label with probability one.
Examples
--------
Consider this simple frame.
>>> frame.inspect()
[#] blip id
=============
[0] abc 0
[1] def 1
[2] ghi 2
[3] jkl 3
[4] mno 4
[5] pqr 5
[6] stu 6
[7] vwx 7
[8] yza 8
[9] bcd 9
We'll assign labels to each row according to a rough 40-30-30 split, for
"train", "test", and "validate".
>>> frame.assign_sample([0.4, 0.3, 0.3])
[===Job Progress===]
>>> frame.inspect()
[#] blip id sample_bin
=========================
[0] abc 0 VA
[1] def 1 TR
[2] ghi 2 TE
[3] jkl 3 TE
[4] mno 4 TE
[5] pqr 5 TR
[6] stu 6 TR
[7] vwx 7 VA
[8] yza 8 VA
[9] bcd 9 VA
Now the frame has a new column named "sample_bin" with a string label.
Values in the other columns are unaffected.
Here it is again, this time specifying labels, output column and random seed
>>> frame.assign_sample([0.2, 0.2, 0.3, 0.3],
... ["cat1", "cat2", "cat3", "cat4"],
... output_column="cat",
... seed=12)
[===Job Progress===]
>>> frame.inspect()
[#] blip id sample_bin cat
===============================
[0] abc 0 VA cat4
[1] def 1 TR cat2
[2] ghi 2 TE cat3
[3] jkl 3 TE cat4
[4] mno 4 TE cat1
[5] pqr 5 TR cat3
[6] stu 6 TR cat2
[7] vwx 7 VA cat3
[8] yza 8 VA cat3
[9] bcd 9 VA cat4
"""
self._scala.assignSample(self._tc.jutils.convert.to_scala_list_double(sample_percentages),
self._tc.jutils.convert.to_scala_option(sample_labels),
self._tc.jutils.convert.to_scala_option(output_column),
self._tc.jutils.convert.to_scala_option(seed))
def bin_column(
self, column_name, bins=None, include_lowest=True, strict_binning=False, bin_column_name=None)
Summarize rows of data based on the value in a single column by sorting them into bins, or groups, based on a list of bin cutoff points or a specified number of equal-width bins.
:param column_name: (str) Name of the column to bin
:param bins: (Optional[List[float]]) Either a single value representing the number of equal-width bins to create, or an array of values containing bin cutoff points. Array can be list or tuple. If an array is provided, values must be progressively increasing. All bin boundaries must be included, so, with N bins, you need N+1 values. Default (None or Empty List) is equal-width bins where the maximum number of bins is the Square-root choice :math:`\lfloor \sqrt{m} \rfloor`, where :math:`m` is the number of rows.
:param include_lowest: (bool) Specify how the boundary conditions are handled. ``True`` indicates that the lower bound of the bin is inclusive. ``False`` indicates that the upper bound is inclusive. Default is ``True``.
:param strict_binning: (bool) Specify how values outside of the cutoffs array should be binned. If set to ``True``, each value less than cutoffs[0] or greater than cutoffs[-1] will be assigned a bin value of -1. If set to ``False``, values less than cutoffs[0] will be included in the first bin while values greater than cutoffs[-1] will be included in the final bin.
:param bin_column_name: (str) The name for the new binned column. Default is ``<column_name>_binned``
:return: (List[float]) a list containing the edges of each bin
- Bin IDs are 0-indexed; in other words, the lowest bin number is 0.
- The first and last cutoffs are always included in the bins. When include_lowest is True, the last bin includes both cutoffs. When include_lowest is False, the first bin (bin 0) includes both cutoffs.
For these examples, we will use a frame with column a accessed by a Frame object my_frame:
>>> frame.inspect(n=11)
[##] a
========
[0] 1
[1] 1
[2] 2
[3] 3
[4] 5
[5] 8
[6] 13
[7] 21
[8] 34
[9] 55
[10] 89
Modify the frame with a column showing what bin the data is in, by specifying cutoffs for the bin edges. The data values should use strict_binning:
>>> frame.bin_column('a', [5, 12, 25, 60], include_lowest=True,
... strict_binning=True, bin_column_name='binned_using_cutoffs')
[===Job Progress===]
>>> frame.inspect(n=11)
[##] a binned_using_cutoffs
==============================
[0] 1 -1
[1] 1 -1
[2] 2 -1
[3] 3 -1
[4] 5 0
[5] 8 0
[6] 13 1
[7] 21 1
[8] 34 2
[9] 55 2
[10] 89 -1
Modify the frame with a column showing what bin the data is in. The data value should not use strict_binning:
>>> frame.bin_column('a', [5, 12, 25, 60], include_lowest=True,
... strict_binning=False, bin_column_name='binned_using_cutoffs')
[===Job Progress===]
>>> frame.inspect(n=11)
[##] a binned_using_cutoffs
==============================
[0] 1 0
[1] 1 0
[2] 2 0
[3] 3 0
[4] 5 0
[5] 8 0
[6] 13 1
[7] 21 1
[8] 34 2
[9] 55 2
[10] 89 2
Modify the frame with a column showing what bin the data is in. The bins should be lower inclusive:
>>> frame.bin_column('a', [1,5,34,55,89], include_lowest=True,
... strict_binning=False, bin_column_name='binned_using_cutoffs')
[===Job Progress===]
>>> frame.inspect( n=11 )
[##] a binned_using_cutoffs
==============================
[0] 1 0
[1] 1 0
[2] 2 0
[3] 3 0
[4] 5 1
[5] 8 1
[6] 13 1
[7] 21 1
[8] 34 2
[9] 55 3
[10] 89 3
Modify the frame with a column showing what bin the data is in. The bins should be upper inclusive:
>>> frame.bin_column('a', [1,5,34,55,89], include_lowest=False,
... strict_binning=True, bin_column_name='binned_using_cutoffs')
[===Job Progress===]
>>> frame.inspect( n=11 )
[##] a binned_using_cutoffs
==============================
[0] 1 0
[1] 1 0
[2] 2 0
[3] 3 0
[4] 5 0
[5] 8 1
[6] 13 1
[7] 21 1
[8] 34 1
[9] 55 2
[10] 89 3
Modify the frame with a column of 3 equal-width bins. This also returns the cutoffs that were used for creating the bins.
>>> cutoffs = frame.bin_column('a', 3, bin_column_name='equal_width_bins')
>>> print cutoffs
[1.0, 30.333333333333332, 59.666666666666664, 89.0]
>>> frame.inspect(n=frame.count())
[##] a equal_width_bins
==========================
[0] 1 0
[1] 1 0
[2] 2 0
[3] 3 0
[4] 5 0
[5] 8 0
[6] 13 0
[7] 21 0
[8] 34 1
[9] 55 1
[10] 89 2
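The equal-width cutoffs above can be reproduced from the column's minimum and maximum (a standalone sketch, not the sparktk implementation):
>>> lo, hi, n_bins = 1.0, 89.0, 3
>>> width = (hi - lo) / n_bins
>>> cutoffs = [lo + i * width for i in range(n_bins)] + [hi]   # ~[1.0, 30.33, 59.67, 89.0]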
def bin_column(self, column_name, bins=None, include_lowest=True, strict_binning=False, bin_column_name=None):
"""
Summarize rows of data based on the value in a single column by sorting them
into bins, or groups, based on a list of bin cutoff points or a specified number of
equal-width bins.
Parameters
----------
:param column_name: (str) Name of the column to bin
:param bins: (Optional[List[float]]) Either a single value representing the number of equal-width bins to create, or an array of values
containing bin cutoff points. Array can be list or tuple. If an array is provided, values must be progressively
increasing. All bin boundaries must be included, so, with N bins, you need N+1 values.
Default (None or Empty List) is equal-width bins where the maximum number of bins is the Square-root choice
:math:`\lfloor \sqrt{m} \rfloor`, where :math:`m` is the number of rows.
:param include_lowest: (bool) Specify how the boundary conditions are handled. ``True`` indicates that the lower bound
of the bin is inclusive. ``False`` indicates that the upper bound is inclusive. Default is ``True``.
:param strict_binning: (bool) Specify how values outside of the cutoffs array should be binned. If set to ``True``, each
value less than cutoffs[0] or greater than cutoffs[-1] will be assigned a bin value of -1. If set to ``False``,
values less than cutoffs[0] will be included in the first bin while values greater than cutoffs[-1] will be
included in the final bin.
:param bin_column_name: (str) The name for the new binned column. Default is ``<column_name>_binned``
:return: (List[float]) a list containing the edges of each bin
Notes
-----
1. Bins IDs are 0-index, in other words, the lowest bin number is 0.
+ The first and last cutoffs are always included in the bins.
When *include_lowest* is ``True``, the last bin includes both cutoffs.
When *include_lowest* is ``False``, the first bin (bin 0) includes both
cutoffs.
Examples
--------
For these examples, we will use a frame with column *a* accessed by a Frame
object *my_frame*:
>>> frame.inspect(n=11)
[##] a
========
[0] 1
[1] 1
[2] 2
[3] 3
[4] 5
[5] 8
[6] 13
[7] 21
[8] 34
[9] 55
[10] 89
Modify the frame with a column showing what bin the data is in, by
specifying cutoffs for the bin edges.
The data values should use strict_binning:
>>> frame.bin_column('a', [5, 12, 25, 60], include_lowest=True,
... strict_binning=True, bin_column_name='binned_using_cutoffs')
[===Job Progress===]
>>> frame.inspect(n=11)
[##] a binned_using_cutoffs
==============================
[0] 1 -1
[1] 1 -1
[2] 2 -1
[3] 3 -1
[4] 5 0
[5] 8 0
[6] 13 1
[7] 21 1
[8] 34 2
[9] 55 2
[10] 89 -1
Modify the frame with a column showing what bin the data is in.
The data value should not use strict_binning:
>>> frame.bin_column('a', [5, 12, 25, 60], include_lowest=True,
... strict_binning=False, bin_column_name='binned_using_cutoffs')
[===Job Progress===]
>>> frame.inspect(n=11)
[##] a binned_using_cutoffs
==============================
[0] 1 0
[1] 1 0
[2] 2 0
[3] 3 0
[4] 5 0
[5] 8 0
[6] 13 1
[7] 21 1
[8] 34 2
[9] 55 2
[10] 89 2
Modify the frame with a column showing what bin the data is in.
The bins should be lower inclusive:
>>> frame.bin_column('a', [1,5,34,55,89], include_lowest=True,
... strict_binning=False, bin_column_name='binned_using_cutoffs')
[===Job Progress===]
>>> frame.inspect( n=11 )
[##] a binned_using_cutoffs
==============================
[0] 1 0
[1] 1 0
[2] 2 0
[3] 3 0
[4] 5 1
[5] 8 1
[6] 13 1
[7] 21 1
[8] 34 2
[9] 55 3
[10] 89 3
Modify the frame with a column showing what bin the data is in.
The bins should be upper inclusive:
>>> frame.bin_column('a', [1,5,34,55,89], include_lowest=False,
... strict_binning=True, bin_column_name='binned_using_cutoffs')
[===Job Progress===]
>>> frame.inspect( n=11 )
[##] a binned_using_cutoffs
==============================
[0] 1 0
[1] 1 0
[2] 2 0
[3] 3 0
[4] 5 0
[5] 8 1
[6] 13 1
[7] 21 1
[8] 34 1
[9] 55 2
[10] 89 3
Modify the frame with a column of 3 equal-width bins. This also
returns the cutoffs that were used for creating the bins.
>>> cutoffs = frame.bin_column('a', 3, bin_column_name='equal_width_bins')
>>> print cutoffs
[1.0, 30.333333333333332, 59.666666666666664, 89.0]
>>> frame.inspect(n=frame.count())
[##] a equal_width_bins
==========================
[0] 1 0
[1] 1 0
[2] 2 0
[3] 3 0
[4] 5 0
[5] 8 0
[6] 13 0
[7] 21 0
[8] 34 1
[9] 55 1
[10] 89 2
"""
if isinstance(bins, tuple):
bins = list(bins)
elif not isinstance(bins, list):
bins = [bins]
return self._tc.jutils.convert.from_scala_seq(self._scala.binColumn(column_name,
self._tc.jutils.convert.to_scala_option_list_double(bins),
include_lowest,
strict_binning,
self._tc.jutils.convert.to_scala_option(bin_column_name)))
def binary_classification_metrics(
self, label_column, pred_column, pos_label, beta=1.0, frequency_column=None)
Statistics of accuracy, precision, and others for a binary classification model.
:param label_column: (str) The name of the column containing the correct label for each instance.
:param pred_column: (str) The name of the column containing the predicted label for each instance.
:param pos_label: (Any) The value to be interpreted as a positive instance for binary classification.
:param beta: (Optional[float]) This is the beta value to use for the :math:`F_{\beta}` measure (default F1 measure is computed); must be greater than zero. Default is 1.
:param frequency_column: (Optional[str]) The name of an optional column containing the frequency of observations.
:return: (ClassificationMetricsValue) The data returned is composed of multiple components: <object>.accuracy : double, <object>.confusion_matrix : table, <object>.f_measure : double, <object>.precision : double, <object>.recall : double
Calculate the accuracy, precision, confusion_matrix, recall and :math:`F_{\beta}` measure for a classification model.
- The f_measure result is the :math:`F_{\beta}` measure for a classification model. The :math:`F_{\beta}` measure of a binary classification model is the harmonic mean of precision and recall. If we let beta :math:`\equiv \beta`, :math:`T_{P}` denote the number of true positives, :math:`F_{P}` denote the number of false positives, and :math:`F_{N}` denote the number of false negatives, then:
.. math::
    F_{\beta} = (1 + \beta^2) * \frac{\frac{T_{P}}{T_{P} + F_{P}} * \frac{T_{P}}{T_{P} + F_{N}}}{\beta^2 * \frac{T_{P}}{T_{P} + F_{P}} + \frac{T_{P}}{T_{P} + F_{N}}}
The :math:`F_{\beta}` measure for a multi-class classification model is computed as the weighted average of the :math:`F_{\beta}` measure for each label, where the weight is the number of instances of each label. The determination of binary vs. multi-class is automatically inferred from the data.
- The recall result of a binary classification model is the proportion of positive instances that are correctly identified. If we let :math:`T_{P}` denote the number of true positives and :math:`F_{N}` denote the number of false negatives, then the model recall is given by :math:`\frac{T_{P}}{T_{P} + F_{N}}`.
- The precision of a binary classification model is the proportion of predicted positive instances that are correctly identified. If we let :math:`T_{P}` denote the number of true positives and :math:`F_{P}` denote the number of false positives, then the model precision is given by :math:`\frac{T_{P}}{T_{P} + F_{P}}`.
- The accuracy of a classification model is the proportion of predictions that are correctly identified. If we let :math:`T_{P}` denote the number of true positives, :math:`T_{N}` denote the number of true negatives, and :math:`K` denote the total number of classified instances, then the model accuracy is given by :math:`\frac{T_{P} + T_{N}}{K}`.
- The confusion_matrix result is a confusion matrix for a binary classifier model, formatted for human readability.
Consider Frame my_frame, which contains the data
>>> my_frame.inspect()
[#] a b labels predictions
==================================
[0] red 1 0 0
[1] blue 3 1 0
[2] green 1 0 0
[3] green 0 1 1
>>> cm = my_frame.binary_classification_metrics('labels', 'predictions', 1, 1)
[===Job Progress===]
>>> cm.f_measure
0.6666666666666666
>>> cm.recall
0.5
>>> cm.accuracy
0.75
>>> cm.precision
1.0
>>> cm.confusion_matrix
Predicted_Pos Predicted_Neg
Actual_Pos 1 1
Actual_Neg 0 2
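As a sanity check, those metrics follow directly from the confusion matrix above (a standalone sketch, not the sparktk implementation):
>>> tp, fn, fp, tn = 1, 1, 0, 2                        # counts from the confusion matrix
>>> precision = float(tp) / (tp + fp)                  # 1.0
>>> recall = float(tp) / (tp + fn)                     # 0.5
>>> accuracy = float(tp + tn) / (tp + fn + fp + tn)    # 0.75
>>> f1 = 2 * precision * recall / (precision + recall)
>>> round(f1, 4)
0.6667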
def binary_classification_metrics(self, label_column, pred_column, pos_label, beta=1.0, frequency_column=None):
"""
Statistics of accuracy, precision, and others for a binary classification model.
Parameters
----------
:param label_column: (str) The name of the column containing the correct label for each instance.
:param pred_column: (str) The name of the column containing the predicted label for each instance.
:param pos_label: (Any) The value to be interpreted as a positive instance for binary classification.
:param beta: (Optional[float]) This is the beta value to use for :math:`F_{ \beta}` measure (default F1 measure is computed);
must be greater than zero. Defaults is 1.
:param frequency_column: (Optional[str]) The name of an optional column containing the frequency of observations.
:return: (ClassificationMetricsValue) The data returned is composed of multiple components:
<object>.accuracy : double
<object>.confusion_matrix : table
<object>.f_measure : double
<object>.precision : double
<object>.recall : double
Calculate the accuracy, precision, confusion_matrix, recall and :math:`F_{ \beta}` measure for a
classification model.
* The **f_measure** result is the :math:`F_{ \beta}` measure for a
classification model.
The :math:`F_{ \beta}` measure of a binary classification model is the
harmonic mean of precision and recall.
If we let:
* beta :math:`\equiv \beta`,
* :math:`T_{P}` denotes the number of true positives,
* :math:`F_{P}` denotes the number of false positives, and
* :math:`F_{N}` denotes the number of false negatives
then:
.. math::
F_{ \beta} = (1 + \beta ^ 2) * \frac{ \frac{T_{P}}{T_{P} + F_{P}} * \
\frac{T_{P}}{T_{P} + F_{N}}}{ \beta ^ 2 * \frac{T_{P}}{T_{P} + \
F_{P}} + \frac{T_{P}}{T_{P} + F_{N}}}
The :math:`F_{ \beta}` measure for a multi-class classification model is
computed as the weighted average of the :math:`F_{ \beta}` measure for
each label, where the weight is the number of instances of each label.
The determination of binary vs. multi-class is automatically inferred
from the data.
* The **recall** result of a binary classification model is the proportion
of positive instances that are correctly identified.
If we let :math:`T_{P}` denote the number of true positives and
:math:`F_{N}` denote the number of false negatives, then the model
recall is given by :math:`\frac {T_{P}} {T_{P} + F_{N}}`.
* The **precision** of a binary classification model is the proportion of
predicted positive instances that are correctly identified.
If we let :math:`T_{P}` denote the number of true positives and
:math:`F_{P}` denote the number of false positives, then the model
precision is given by: :math:`\frac {T_{P}} {T_{P} + F_{P}}`.
* The **accuracy** of a classification model is the proportion of
predictions that are correctly identified.
If we let :math:`T_{P}` denote the number of true positives,
:math:`T_{N}` denote the number of true negatives, and :math:`K` denote
the total number of classified instances, then the model accuracy is
given by: :math:`\frac{T_{P} + T_{N}}{K}`.
* The **confusion_matrix** result is a confusion matrix for a
binary classifier model, formatted for human readability.
Examples
--------
Consider Frame *my_frame*, which contains the data
>>> my_frame.inspect()
[#] a b labels predictions
==================================
[0] red 1 0 0
[1] blue 3 1 0
[2] green 1 0 0
[3] green 0 1 1
>>> cm = my_frame.binary_classification_metrics('labels', 'predictions', 1, 1)
[===Job Progress===]
>>> cm.f_measure
0.6666666666666666
>>> cm.recall
0.5
>>> cm.accuracy
0.75
>>> cm.precision
1.0
>>> cm.confusion_matrix
Predicted_Pos Predicted_Neg
Actual_Pos 1 1
Actual_Neg 0 2
"""
return ClassificationMetricsValue(self._tc, self._scala.binaryClassificationMetrics(label_column,
pred_column,
pos_label,
float(beta),
self._tc.jutils.convert.to_scala_option(frequency_column)))
def box_cox(
self, column_name, lambda_value=0.0, box_cox_column_name=None)
Calculate the box-cox transformation for each row on a given column of the current frame
:param column_name: Name of the column to perform the transformation on
:param lambda_value: Lambda power parameter. Default is 0.0
:param box_cox_column_name: Optional column name for the box_cox value
:return: (Frame) returns a frame with a new column storing the box-cox transformed value
Calculate the box-cox transformation for each row in column 'column_name' of a frame using the lambda_value.
Box-cox transformation is computed by the following formula:
boxcox = log(y), if lambda = 0
boxcox = (y^lambda - 1) / lambda, otherwise
where log is the natural log
>>> data = [[7.7132064326674596],[0.207519493594015],[6.336482349262754],[7.4880388253861181],[4.9850701230259045]]
>>> schema = [("input", float)]
>>> my_frame = tc.frame.create(data, schema)
>>> my_frame.inspect()
[#] input
===================
[0] 7.71320643267
[1] 0.207519493594
[2] 6.33648234926
[3] 7.48803882539
[4] 4.98507012303
Compute the box-cox transformation on the 'input' column
>>> my_frame.box_cox('input',0.3)
A new column gets added to the frame which stores the box-cox transformation for each row
>>> my_frame.inspect()
[#] input input_lambda_0.3
=====================================
[0] 7.71320643267 2.81913279907
[1] 0.207519493594 -1.25365381375
[2] 6.33648234926 2.46673638752
[3] 7.48803882539 2.76469126003
[4] 4.98507012303 2.06401101556
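The transformed values can be reproduced directly from the formula above (a standalone sketch, not the sparktk implementation):
>>> import math
>>> def box_cox_value(y, lmbda):
...     # natural log when lambda is 0, power transform otherwise
...     return math.log(y) if lmbda == 0 else (y ** lmbda - 1) / lmbda
>>> round(box_cox_value(7.7132064326674596, 0.3), 4)
2.8191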
def box_cox(self, column_name, lambda_value=0.0, box_cox_column_name=None):
"""
Calculate the box-cox transformation for each row on a given column of the current frame
Parameters
----------
:param column_name: Name of the column to perform the transformation on
:param lambda_value: Lambda power parameter. Default is 0.0
:param box_cox_column_name: Optional column name for the box_cox value
:return: (Frame) returns a frame with a new column storing the box-cox transformed value
Calculate the box-cox transformation for each row in column 'column_name' of a frame using the lambda_value.
Box-cox transformation is computed by the following formula:
boxcox = log(y); if lambda=0,
boxcox = (y^lambda -1)/lambda ; else
where log is the natural log
Examples
--------
>>> data = [[7.7132064326674596],[0.207519493594015],[6.336482349262754],[7.4880388253861181],[4.9850701230259045]]
>>> schema = [("input", float)]
>>> my_frame = tc.frame.create(data, schema)
>>> my_frame.inspect()
[#] input
===================
[0] 7.71320643267
[1] 0.207519493594
[2] 6.33648234926
[3] 7.48803882539
[4] 4.98507012303
Compute the box-cox transformation on the 'input' column
>>> my_frame.box_cox('input',0.3)
A new column gets added to the frame which stores the box-cox transformation for each row
>>> my_frame.inspect()
[#] input input_lambda_0.3
=====================================
[0] 7.71320643267 2.81913279907
[1] 0.207519493594 -1.25365381375
[2] 6.33648234926 2.46673638752
[3] 7.48803882539 2.76469126003
[4] 4.98507012303 2.06401101556
"""
self._scala.boxCox(column_name, lambda_value, self._tc.jutils.convert.to_scala_option(box_cox_column_name))
def categorical_summary(
self, columns, top_k=None, threshold=None)
Build summary of the data.
:param columns: (List[CategoricalSummaryInput]) List of CategoricalSummaryInput consisting of column, topk and/or threshold
:param top_k: (Optional[int]) Displays levels which are in the top k most frequently occurring values for that column. Default is 10.
:param threshold: (Optional[float]) Displays levels which are above the threshold percentage with respect to the total row count. Default is 0.0.
:return: (List[CategoricalSummaryOutput]) List of CategoricalSummaryOutput objects for specified column(s) consisting of levels with their frequency and percentage.
Compute a summary of the data in a column(s) for categorical or numerical data types. The returned value is a Map containing categorical summary for each specified column.
For each column, levels which satisfy the top k and/or threshold cutoffs are displayed along with their frequency and percentage occurrence with respect to the total rows in the dataset.
Performs level pruning first based on top k and then filters out levels which satisfy the threshold criterion.
Missing data is reported when a column value is empty ("") or null.
All remaining data is grouped together in the Other category and its frequency and percentage are reported as well.
User must specify the column name and can optionally specify top_k and/or threshold.
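A rough sketch of that pruning order on a plain Python list (hypothetical helper, not the sparktk implementation):
>>> from collections import Counter
>>> def summarize(values, top_k=10, threshold=0.0):
...     total = len(values)
...     missing = sum(1 for v in values if v is None or v == "")
...     counts = Counter(v for v in values if v not in (None, ""))
...     top = counts.most_common(top_k)                # prune to the top_k most frequent levels first...
...     kept = [(lvl, n) for lvl, n in top if float(n) / total > threshold]   # ...then apply the threshold
...     other = total - missing - sum(n for _, n in kept)
...     return kept, missing, other
>>> summarize(['a', 'a', 'a', 'b', 'b', 'c', None], top_k=2)
([('a', 3), ('b', 2)], 1, 1)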
Consider Frame my_frame, which contains the data
>>> my_frame.inspect()
[#] source target
=====================================
[0] entity thing
[1] entity physical_entity
[2] entity abstraction
[3] physical_entity entity
[4] physical_entity matter
[5] physical_entity process
[6] physical_entity thing
[7] physical_entity substance
[8] physical_entity object
[9] physical_entity causal_agent
>>> cm = my_frame.categorical_summary('source', top_k=2)
[===Job Progress===]
>>> cm
column_name = "source"
[#] level frequency percentage
===========================================
[0] thing 9 0.321428571429
[1] abstraction 9 0.321428571429
[2] <Missing> 0 0.0
[3] <Other> 10 0.357142857143
>>> cm = my_frame.categorical_summary('source', threshold = 0.5)
[===Job Progress===]
>>> cm
column_name = "source"
[#] level frequency percentage
=====================================
[0] <Missing> 0 0.0
[1] <Other> 28 1.0
>>> cm = my_frame.categorical_summary(['source', 'target'], top_k=[2, None], threshold=[None, 0.5])
[===Job Progress===]
>>> cm
column_name = "source"
[#] level frequency percentage
===========================================
[0] thing 9 0.321428571429
[1] abstraction 9 0.321428571429
[2] <Missing> 0 0.0
[3] <Other> 10 0.357142857143
<BLANKLINE>
column_name = "target"
[#] level frequency percentage
=====================================
[0] <Missing> 0 0.0
[1] <Other> 28 1.0
def categorical_summary(self, columns, top_k=None, threshold=None):
"""
Build summary of the data.
Parameters
----------
:param columns: (List[CategoricalSummaryInput]) List of CategoricalSummaryInput consisting of column, topk and/or threshold
:param top_k: (Optional[int]) Displays levels which are in the top k most frequently
occurring values for that column.
Default is 10.
:param threshold: (Optional[float]) Displays levels which are above the threshold percentage with
respect to the total row count.
Default is 0.0.
:return: (List[CategoricalSummaryOutput]) List of CategoricalSummaryOutput objects for specified column(s) consisting of levels with
their frequency and percentage.
Compute a summary of the data in a column(s) for categorical or numerical data types.
The returned value is a Map containing categorical summary for each specified column.
For each column, levels which satisfy the top k and/or threshold cutoffs are
displayed along with their frequency and percentage occurrence with respect to
the total rows in the dataset.
Performs level pruning first based on top k and then filters
out levels which satisfy the threshold criterion.
Missing data is reported when a column value is empty ("") or null.
All remaining data is grouped together in the Other category and its frequency
and percentage are reported as well.
User must specify the column name and can optionally specify top_k and/or threshold.
Examples
--------
Consider Frame *my_frame*, which contains the data
>>> my_frame.inspect()
[#] source target
=====================================
[0] entity thing
[1] entity physical_entity
[2] entity abstraction
[3] physical_entity entity
[4] physical_entity matter
[5] physical_entity process
[6] physical_entity thing
[7] physical_entity substance
[8] physical_entity object
[9] physical_entity causal_agent
>>> cm = my_frame.categorical_summary('source', top_k=2)
[===Job Progress===]
>>> cm
column_name = "source"
[#] level frequency percentage
===========================================
[0] thing 9 0.321428571429
[1] abstraction 9 0.321428571429
[2] <Missing> 0 0.0
[3] <Other> 10 0.357142857143
>>> cm = my_frame.categorical_summary('source', threshold = 0.5)
[===Job Progress===]
>>> cm
column_name = "source"
[#] level frequency percentage
=====================================
[0] <Missing> 0 0.0
[1] <Other> 28 1.0
>>> cm = my_frame.categorical_summary(['source', 'target'], top_k=[2, None], threshold=[None, 0.5])
[===Job Progress===]
>>> cm
column_name = "source"
[#] level frequency percentage
===========================================
[0] thing 9 0.321428571429
[1] abstraction 9 0.321428571429
[2] <Missing> 0 0.0
[3] <Other> 10 0.357142857143
<BLANKLINE>
column_name = "target"
[#] level frequency percentage
=====================================
[0] <Missing> 0 0.0
[1] <Other> 28 1.0
"""
if not isinstance(columns, list):
columns = [columns]
columns = self._tc.jutils.convert.to_scala_list_string(columns)
if top_k is not None:
if not isinstance(top_k, list):
top_k = [top_k]
top_k = [self._tc.jutils.convert.to_scala_option(item) for item in top_k]
top_k = self._tc.jutils.convert.to_scala_list(top_k)
if threshold is not None:
if not isinstance(threshold, list):
threshold = [threshold]
threshold = [self._tc.jutils.convert.to_scala_option(item) for item in threshold]
threshold = self._tc.jutils.convert.to_scala_list(threshold)
result_list = list(self._scala.categoricalSummary(columns,
self._tc.jutils.convert.to_scala_option(top_k),
self._tc.jutils.convert.to_scala_option(threshold)))
return CategoricalSummaryOutputList([CategoricalSummaryOutput(item) for item in result_list])
def collect(
self, columns=None)
Brings all the rows of data from the frame into a local python list of lists
(Use the 'take' operation for control over row count and offset of the collected data)
:param columns: (Optional[str or List[str]]) If not None, only the given columns' data will be provided. By default, all columns are included.
:return: (List[List[*]]) the frame data represented as a list of lists
>>> schema = [('name',str), ('age', int), ('tenure', int), ('phone', str)]
>>> rows = [['Fred', 39, 16, '555-1234'], ['Susan', 33, 3, '555-0202'], ['Thurston', 65, 26, '555-4510'], ['Judy', 44, 14, '555-2183']]
>>> frame = tc.frame.create(rows, schema)
>>> frame.collect()
[['Fred', 39, 16, '555-1234'], ['Susan', 33, 3, '555-0202'], ['Thurston', 65, 26, '555-4510'], ['Judy', 44, 14, '555-2183']]
>>> frame.collect(['name', 'phone'])
[['Fred', '555-1234'], ['Susan', '555-0202'], ['Thurston', '555-4510'], ['Judy', '555-2183']]
def collect(self, columns=None):
"""
Brings all the rows of data from the frame into a local python list of lists
(Use the 'take' operation for control over row count and offset of the collected data)
Parameters
----------
:param columns: (Optional[str or List[str]]) If not None, only the given columns' data will be provided.
By default, all columns are included.
:return: (List[List[*]]) the frame data represented as a list of lists
Examples
--------
>>> schema = [('name',str), ('age', int), ('tenure', int), ('phone', str)]
>>> rows = [['Fred', 39, 16, '555-1234'], ['Susan', 33, 3, '555-0202'], ['Thurston', 65, 26, '555-4510'], ['Judy', 44, 14, '555-2183']]
>>> frame = tc.frame.create(rows, schema)
>>> frame.collect()
[['Fred', 39, 16, '555-1234'], ['Susan', 33, 3, '555-0202'], ['Thurston', 65, 26, '555-4510'], ['Judy', 44, 14, '555-2183']]
>>> frame.collect(['name', 'phone'])
[['Fred', '555-1234'], ['Susan', '555-0202'], ['Thurston', '555-4510'], ['Judy', '555-2183']]
"""
if columns is not None:
affirm_type.list_of_str(columns, "columns")
if not columns:
return []
if self._is_scala:
scala_data = self._scala.collect(self._tc.jutils.convert.to_scala_option_list_string(columns))
schema = get_schema_for_columns(self.schema, columns) if columns else self.schema
data = TakeCollectHelper.scala_rows_to_python(self._tc, scala_data, schema)
else:
if columns:
select = TakeCollectHelper.get_select_columns_function(self.schema, columns)
data = self._python.rdd.map(select).collect()
else:
data = self._python.rdd.collect()
return data
def column_median(self, data_column, weights_column=None)
Calculate the (weighted) median of a column.
The median is the least value X in the range of the distribution so that the cumulative weight of values strictly below X is strictly less than half of the total weight and the cumulative weight of values up to and including X is greater than or equal to one-half of the total weight.
All data elements of weight less than or equal to 0 are excluded from the calculation, as are all data elements whose weight is NaN or infinite. If a weight column is provided and no weights are finite numbers greater than 0, None is returned.
data_column | (str): | The column whose median is to be calculated. |
weights_column | (Option[str]): | The column that provides weights (frequencies) for the median calculation. Must contain numerical data. Default is all items have a weight of 1. |
Returns | (varies): | The median of the values. If a weight column is provided and no weights are finite numbers greater than 0, None is returned. The type of the median returned is the same as the contents of the data column, so a column of Longs will result in a Long median and a column of Floats will result in a Float median. |
Given a frame with column 'a' accessed by a Frame object 'my_frame':
>>> data = [[2],[3],[3],[5],[7],[10],[30]]
>>> schema = [('a', int)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Inspect my_frame
>>> my_frame.inspect()
[#] a
=======
[0] 2
[1] 3
[2] 3
[3] 5
[4] 7
[5] 10
[6] 30
Compute and return middle number of values in column a:
>>> median = my_frame.column_median('a')
[===Job Progress===]
>>> print median
5
Given a frame with column 'a' and column 'w' as weights accessed by a Frame object 'my_frame':
>>> data = [[2,1.7],[3,0.5],[3,1.2],[5,0.8],[7,1.1],[10,0.8],[30,0.1]]
>>> schema = [('a', int), ('w', float)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Inspect my_frame
>>> my_frame.inspect()
[#] a w
============
[0] 2 1.7
[1] 3 0.5
[2] 3 1.2
[3] 5 0.8
[4] 7 1.1
[5] 10 0.8
[6] 30 0.1
Compute and return middle number of values in column 'a' with weights 'w':
>>> median = my_frame.column_median('a', weights_column='w')
[===Job Progress===]
>>> print median
3
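As a sanity check (not part of sparktk), the weighted median above can be reproduced directly from the definition with a few lines of plain Python, using the same data and weights as the example:

    # Weighted median per the definition above: the least value x whose cumulative
    # weight (up to and including x) reaches at least half of the total weight.
    def weighted_median(pairs):
        pairs = sorted(pairs)                       # sort by value
        total = sum(w for _, w in pairs)
        cumulative = 0.0
        for value, weight in pairs:
            cumulative += weight
            if cumulative >= total / 2.0:
                return value

    data = [(2, 1.7), (3, 0.5), (3, 1.2), (5, 0.8), (7, 1.1), (10, 0.8), (30, 0.1)]
    print(weighted_median(data))                    # 3, matching column_median above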
def column_median(self, data_column, weights_column=None):
"""
Calculate the (weighted) median of a column.
The median is the least value X in the range of the distribution so that
the cumulative weight of values strictly below X is strictly less than half
of the total weight and the cumulative weight of values up to and including X
is greater than or equal to one-half of the total weight.
All data elements of weight less than or equal to 0 are excluded from the
calculation, as are all data elements whose weight is NaN or infinite.
If a weight column is provided and no weights are finite numbers greater
than 0, None is returned.
Parameters
----------
:param data_column: (str) The column whose median is to be calculated.
:param weights_column: (Option[str]) The column that provides weights (frequencies) for the median calculation.
Must contain numerical data.
Default is all items have a weight of 1.
:return: (varies) The median of the values.
If a weight column is provided and no weights are finite numbers greater
than 0, None is returned.
The type of the median returned is the same as the contents of the data
column, so a column of Longs will result in a Long median and a column of
Floats will result in a Float median.
Examples
--------
Given a frame with column 'a' accessed by a Frame object 'my_frame':
>>> data = [[2],[3],[3],[5],[7],[10],[30]]
>>> schema = [('a', int)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Inspect my_frame
>>> my_frame.inspect()
[#] a
=======
[0] 2
[1] 3
[2] 3
[3] 5
[4] 7
[5] 10
[6] 30
Compute and return middle number of values in column *a*:
>>> median = my_frame.column_median('a')
[===Job Progress===]
>>> print median
5
Given a frame with column 'a' and column 'w' as weights accessed by a Frame object 'my_frame':
>>> data = [[2,1.7],[3,0.5],[3,1.2],[5,0.8],[7,1.1],[10,0.8],[30,0.1]]
>>> schema = [('a', int), ('w', float)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Inspect my_frame
>>> my_frame.inspect()
[#] a w
============
[0] 2 1.7
[1] 3 0.5
[2] 3 1.2
[3] 5 0.8
[4] 7 1.1
[5] 10 0.8
[6] 30 0.1
Compute and return middle number of values in column 'a' with weights 'w':
>>> median = my_frame.column_median('a', weights_column='w')
[===Job Progress===]
>>> print median
3
"""
val = self._scala.columnMedian(data_column, self._tc.jutils.convert.to_scala_option(weights_column))
optional_val = self._tc.jutils.convert.from_scala_option(val)
if optional_val is None:
return None
else:
return optional_val.value()
def column_mode(self, data_column, weights_column=None, max_modes_returned=None)
Evaluate the weights assigned to rows.
Calculate the modes of a column. A mode is a data element of maximum weight. All data elements of weight less than or equal to 0 are excluded from the calculation, as are all data elements whose weight is NaN or infinite. If there are no data elements of finite weight greater than 0, no mode is returned.
Because data distributions often have multiple modes, it is possible for a set of modes to be returned. By default, only one is returned, but by setting the optional parameter max_modes_returned, a larger number of modes can be returned.
data_column | (str): | Name of the column supplying the data. |
weights_column | (Optional[str]): | Name of the column supplying the weights. Default is all items have weight of 1. |
max_modes_returned | (Option[int]): | Maximum number of modes returned. Default is 1. |
Returns | (ColumnMode): | ColumnMode object which includes multiple components (mode, weight_of_mode, total_weight, and mode_count). |
The data returned is composed of multiple components:
mode : A mode is a data element of maximum net weight. A set of modes is returned. The empty set is returned when the sum of the weights is 0. If the number of modes is less than or equal to the parameter max_modes_returned, then all modes of the data are returned. If the number of modes is greater than the max_modes_returned parameter, only the first max_modes_returned many modes (per a canonical ordering) are returned.
weight_of_mode : Weight of a mode. If there are no data elements of finite weight greater than 0, the weight of the mode is 0. If no weights column is given, this is the number of appearances of each mode.
total_weight : Sum of all weights in the weight column. This is the row count if no weights are given. If no weights column is given, this is the number of rows in the table with non-zero weight.
mode_count : The number of distinct modes in the data. In the case that the data is very multimodal, this number may exceed max_modes_returned.
Given a frame with column 'a' accessed by a Frame object 'my_frame':
>>> data = [[2],[3],[3],[5],[7],[10],[30]]
>>> schema = [('a', int)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Inspect my_frame
>>> my_frame.inspect()
[#] a
=======
[0] 2
[1] 3
[2] 3
[3] 5
[4] 7
[5] 10
[6] 30
Compute and return a ColumnMode object containing summary statistics of column a:
>>> mode = my_frame.column_mode('a')
[===Job Progress===]
>>> print mode
mode_count = 1
modes = [3]
total_weight = 7.0
weight_of_mode = 2.0
Given a frame with column 'a' and column 'w' as weights accessed by a Frame object 'my_frame':
>>> data = [[2,1.7],[3,0.5],[3,1.2],[5,0.8],[7,1.1],[10,0.8],[30,0.1]]
>>> schema = [('a', int), ('w', float)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Inspect my_frame
>>> my_frame.inspect()
[#] a w
============
[0] 2 1.7
[1] 3 0.5
[2] 3 1.2
[3] 5 0.8
[4] 7 1.1
[5] 10 0.8
[6] 30 0.1
Compute and return ColumnMode object containing summary statistics of column 'a' with weights 'w':
>>> mode = my_frame.column_mode('a', weights_column='w')
[===Job Progress===]
>>> print mode
mode_count = 2
modes = [2]
total_weight = 6.2
weight_of_mode = 1.7
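The weighted-mode output above can likewise be checked by hand. A minimal plain-Python sketch (independent of sparktk) that tallies the net weight per distinct value:

    from collections import defaultdict

    # Net weight per distinct value; the modes are the values of maximum net weight.
    data = [(2, 1.7), (3, 0.5), (3, 1.2), (5, 0.8), (7, 1.1), (10, 0.8), (30, 0.1)]
    net_weight = defaultdict(float)
    for value, weight in data:
        net_weight[value] += weight

    weight_of_mode = max(net_weight.values())       # 1.7
    modes = sorted(v for v, w in net_weight.items() if w == weight_of_mode)
    print(modes)                    # [2, 3] -> mode_count is 2; sparktk reports one mode by default
    print(sum(w for _, w in data))  # total_weight, approximately 6.2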
def column_mode(self, data_column, weights_column=None, max_modes_returned=None):
"""
Evaluate the weights assigned to rows.
Calculate the modes of a column.
A mode is a data element of maximum weight.
All data elements of weight less than or equal to 0 are excluded from the
calculation, as are all data elements whose weight is NaN or infinite.
If there are no data elements of finite weight greater than 0,
no mode is returned.
Because data distributions often have multiple modes, it is possible for a
set of modes to be returned.
By default, only one is returned, but by setting the optional parameter
max_modes_returned, a larger number of modes can be returned.
Parameters
----------
:param data_column: (str) Name of the column supplying the data.
:param weights_column: (Optional[str]) Name of the column supplying the weights.
Default is all items have weight of 1.
:param max_modes_returned: (Option[int]) Maximum number of modes returned. Default is 1.
:return: (ColumnMode) ColumnMode object which includes multiple components (mode, weight_of_mode, total_weight,
and mode_count).
The data returned is composed of multiple components\:
mode : A mode is a data element of maximum net weight.
A set of modes is returned.
The empty set is returned when the sum of the weights is 0.
If the number of modes is less than or equal to the parameter
max_modes_returned, then all modes of the data are
returned.
If the number of modes is greater than the max_modes_returned
parameter, only the first max_modes_returned many modes (per a
canonical ordering) are returned.
weight_of_mode : Weight of a mode.
If there are no data elements of finite weight greater than 0,
the weight of the mode is 0.
If no weights column is given, this is the number of appearances
of each mode.
total_weight : Sum of all weights in the weight column.
This is the row count if no weights are given.
If no weights column is given, this is the number of rows in
the table with non-zero weight.
mode_count : The number of distinct modes in the data.
In the case that the data is very multimodal, this number may
exceed max_modes_returned.
Examples
--------
Given a frame with column 'a' accessed by a Frame object 'my_frame':
>>> data = [[2],[3],[3],[5],[7],[10],[30]]
>>> schema = [('a', int)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Inspect my_frame
>>> my_frame.inspect()
[#] a
=======
[0] 2
[1] 3
[2] 3
[3] 5
[4] 7
[5] 10
[6] 30
Compute and return a ColumnMode object containing summary statistics of column *a*:
>>> mode = my_frame.column_mode('a')
[===Job Progress===]
>>> print mode
mode_count = 1
modes = [3]
total_weight = 7.0
weight_of_mode = 2.0
Given a frame with column 'a' and column 'w' as weights accessed by a Frame object 'my_frame':
>>> data = [[2,1.7],[3,0.5],[3,1.2],[5,0.8],[7,1.1],[10,0.8],[30,0.1]]
>>> schema = [('a', int), ('w', float)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Inspect my_frame
>>> my_frame.inspect()
[#] a w
============
[0] 2 1.7
[1] 3 0.5
[2] 3 1.2
[3] 5 0.8
[4] 7 1.1
[5] 10 0.8
[6] 30 0.1
Compute and return ColumnMode object containing summary statistics of column 'a' with weights 'w':
>>> mode = my_frame.column_mode('a', weights_column='w')
[===Job Progress===]
>>> print mode
mode_count = 2
modes = [2]
total_weight = 6.2
weight_of_mode = 1.7
"""
return ColumnMode(self._scala.columnMode(data_column,
self._tc.jutils.convert.to_scala_option(weights_column),
self._tc.jutils.convert.to_scala_option(max_modes_returned)))
def column_summary_statistics(self, data_column, weights_column=None, use_popultion_variance=False)
Calculate multiple statistics for a column.
data_column | (str): | The column to be statistically summarized. Must contain numerical data; all NaNs and infinite values are excluded from the calculation. |
weights_column | (Optional[str]): | Name of column holding weights of column values. |
use_popultion_variance | (Optional[bool]): | If true, the variance is calculated as the population variance. If false, the variance is calculated as the sample variance. Because this option affects the variance, it affects the standard deviation and the confidence intervals as well. Default is false. |
Returns | (ColumnSummaryStatistics): | ColumnSummaryStatistics object containing summary statistics. |
The data returned is composed of multiple components:
- mean : [ double | None ]
  Arithmetic mean of the data.
- geometric_mean : [ double | None ]
  Geometric mean of the data. None when there is a data element <= 0, 1.0 when there are no data elements.
- variance : [ double | None ]
  None when there are <= 1 many data elements. Sample variance is the weighted sum of the squared distance of each data element from the weighted mean, divided by the total weight minus 1. None when the sum of the weights is <= 1. Population variance is the weighted sum of the squared distance of each data element from the weighted mean, divided by the total weight.
- standard_deviation : [ double | None ]
  The square root of the variance. None when sample variance is being used and the sum of weights is <= 1.
- total_weight : long
  The count of all data elements that are finite numbers. In other words, after excluding NaNs and infinite values.
- minimum : [ double | None ]
  Minimum value in the data. None when there are no data elements.
- maximum : [ double | None ]
  Maximum value in the data. None when there are no data elements.
- mean_confidence_lower : [ double | None ]
  Lower limit of the 95% confidence interval about the mean. Assumes a Gaussian distribution. None when there are no elements of positive weight.
- mean_confidence_upper : [ double | None ]
  Upper limit of the 95% confidence interval about the mean. Assumes a Gaussian distribution. None when there are no elements of positive weight.
- bad_row_count : [ double | None ]
  The number of rows containing a NaN or infinite value in either the data or weights column.
- good_row_count : [ double | None ]
  The number of rows not containing a NaN or infinite value in either the data or weights column.
- positive_weight_count : [ double | None ]
  The number of valid data elements with weight > 0. This is the number of entries used in the statistical calculation.
- non_positive_weight_count : [ double | None ]
  The number of valid data elements with finite weight <= 0.
- Sample Variance
  Sample Variance is computed by the following formula:
  .. math::
      \left( \frac{1}{W - 1} \right) * \sum_{i} \left(x_{i} - M \right)^{2}
  where :math:`W` is the sum of weights over valid elements of positive weight, and :math:`M` is the weighted mean.
- Population Variance
  Population Variance is computed by the following formula:
  .. math::
      \left( \frac{1}{W} \right) * \sum_{i} \left(x_{i} - M \right)^{2}
  where :math:`W` is the sum of weights over valid elements of positive weight, and :math:`M` is the weighted mean.
- Standard Deviation
  The square root of the variance.
- Logging Invalid Data
  A row is bad when it contains a NaN or infinite value in either its data or weights column. In this case, it contributes to bad_row_count; otherwise it contributes to good_row_count. A good row can be skipped because the value in its weight column is less than or equal to 0. In this case, it contributes to non_positive_weight_count, otherwise (when the weight is greater than 0) it contributes to valid_data_weight_pair_count.
Equations
bad_row_count + good_row_count = # rows in the frame
positive_weight_count + non_positive_weight_count = good_row_count
In particular, when no weights column is provided and all weights are 1.0:
non_positive_weight_count = 0 and
positive_weight_count = good_row_count
Given a frame with column 'a' accessed by a Frame object 'my_frame':
>>> data = [[2],[3],[3],[5],[7],[10],[30]]
>>> schema = [('a', int)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Inspect my_frame
>>> my_frame.inspect()
[#] a
=======
[0] 2
[1] 3
[2] 3
[3] 5
[4] 7
[5] 10
[6] 30
Compute and return summary statistics for values in column a:
>>> summary_statistics = my_frame.column_summary_statistics('a')
[===Job Progress===]
>>> print summary_statistics
bad_row_count = 0
geometric_mean = 5.67257514519
good_row_count = 7
maximum = 30.0
mean = 8.57142857143
mean_confidence_lower = 1.27708372993
mean_confidence_upper = 15.8657734129
minimum = 2.0
non_positive_weight_count = 0
positive_weight_count = 7
standard_deviation = 9.84644001416
total_weight = 7.0
variance = 96.9523809524
Given a frame with column 'a' and column 'w' as weights accessed by a Frame object 'my_frame':
>>> data = [[2,1.7],[3,0.5],[3,1.2],[5,0.8],[7,1.1],[10,0.8],[30,0.1]]
>>> schema = [('a', int), ('w', float)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Inspect my_frame
>>> my_frame.inspect()
[#] a w
============
[0] 2 1.7
[1] 3 0.5
[2] 3 1.2
[3] 5 0.8
[4] 7 1.1
[5] 10 0.8
[6] 30 0.1
Compute and return summary statistics for values in column 'a' with weights 'w':
>>> summary_statistics = my_frame.column_summary_statistics('a', weights_column='w')
[===Job Progress===]
>>> print summary_statistics
bad_row_count = 0
geometric_mean = 4.03968288152
good_row_count = 7
maximum = 30.0
mean = 5.03225806452
mean_confidence_lower = 1.42847242276
mean_confidence_upper = 8.63604370627
minimum = 2.0
non_positive_weight_count = 0
positive_weight_count = 7
standard_deviation = 4.57824177679
total_weight = 6.2
variance = 20.9602977667
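The weighted mean and sample variance in the second example follow directly from the formulas above. A minimal plain-Python check (not part of sparktk), using the same data and weights:

    data = [(2, 1.7), (3, 0.5), (3, 1.2), (5, 0.8), (7, 1.1), (10, 0.8), (30, 0.1)]
    W = sum(w for _, w in data)                      # total_weight, approximately 6.2
    M = sum(x * w for x, w in data) / W              # weighted mean
    sample_variance = sum(w * (x - M) ** 2 for x, w in data) / (W - 1)
    print(M)                        # ~5.03225806452  (mean)
    print(sample_variance)          # ~20.9602977667  (variance)
    print(sample_variance ** 0.5)   # ~4.57824177679  (standard_deviation)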
def column_summary_statistics(self, data_column, weights_column=None, use_popultion_variance=False):
"""
Calculate multiple statistics for a column.
Parameters
----------
:param data_column: (str) The column to be statistically summarized.
Must contain numerical data; all NaNs and infinite values are excluded from the calculation.
:param weights_column: (Optional[str]) Name of column holding weights of column values.
:param use_popultion_variance: (Optional[bool]) If true, the variance is calculated as the population variance.
If false, the variance is calculated as the sample variance.
Because this option affects the variance, it affects the standard deviation and
the confidence intervals as well.
Default is false.
:return: (ColumnSummaryStatistics) ColumnSummaryStatistics object containing summary statistics.
The data returned is composed of multiple components:
* mean : [ double | None ]
Arithmetic mean of the data.
* geometric_mean : [ double | None ]
Geometric mean of the data. None when there is a data element <= 0, 1.0 when there are no data elements.
* variance : [ double | None ]
None when there are <= 1 many data elements. Sample variance is the weighted sum of the squared distance of each data element from the weighted mean, divided by the total weight minus 1. None when the sum of the weights is <= 1. Population variance is the weighted sum of the squared distance of each data element from the weighted mean, divided by the total weight.
* standard_deviation : [ double | None ]
The square root of the variance. None when sample variance is being used and the sum of weights is <= 1.
* total_weight : long
The count of all data elements that are finite numbers. In other words, after excluding NaNs and infinite values.
* minimum : [ double | None ]
Minimum value in the data. None when there are no data elements.
* maximum : [ double | None ]
Maximum value in the data. None when there are no data elements.
* mean_confidence_lower : [ double | None ]
Lower limit of the 95% confidence interval about the mean. Assumes a Gaussian distribution. None when there are no elements of positive weight.
* mean_confidence_upper : [ double | None ]
Upper limit of the 95% confidence interval about the mean. Assumes a Gaussian distribution. None when there are no elements of positive weight.
* bad_row_count : [ double | None ]
The number of rows containing a NaN or infinite value in either the data or weights column.
* good_row_count : [ double | None ]
The number of rows not containing a NaN or infinite value in either the data or weights column.
* positive_weight_count : [ double | None ]
The number of valid data elements with weight > 0. This is the number of entries used in the statistical calculation.
* non_positive_weight_count : [ double | None ]
The number of valid data elements with finite weight <= 0.
Notes
-----
* Sample Variance
Sample Variance is computed by the following formula:
.. math::
\left( \frac{1}{W - 1} \right) * \sum_{i} \left(x_{i} - M \right)^{2}
where :math:`W` is sum of weights over valid elements of positive
weight, and :math:`M` is the weighted mean.
* Population Variance
Population Variance is computed by the following formula:
.. math::
\left( \frac{1}{W} \right) * \sum_{i} \left(x_{i} - M \right)^{2}
where :math:`W` is sum of weights over valid elements of positive
weight, and :math:`M` is the weighted mean.
* Standard Deviation
The square root of the variance.
* Logging Invalid Data
A row is bad when it contains a NaN or infinite value in either
its data or weights column.
In this case, it contributes to bad_row_count; otherwise it
contributes to good row count.
A good row can be skipped because the value in its weight
column is less than or equal to 0.
In this case, it contributes to non_positive_weight_count, otherwise
(when the weight is greater than 0) it contributes to
valid_data_weight_pair_count.
**Equations**
bad_row_count + good_row_count = # rows in the frame
positive_weight_count + non_positive_weight_count = good_row_count
In particular, when no weights column is provided and all weights are 1.0:
non_positive_weight_count = 0 and
positive_weight_count = good_row_count
Examples
--------
Given a frame with column 'a' accessed by a Frame object 'my_frame':
>>> data = [[2],[3],[3],[5],[7],[10],[30]]
>>> schema = [('a', int)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Inspect my_frame
>>> my_frame.inspect()
[#] a
=======
[0] 2
[1] 3
[2] 3
[3] 5
[4] 7
[5] 10
[6] 30
Compute and return summary statistics for values in column *a*:
>>> summary_statistics = my_frame.column_summary_statistics('a')
[===Job Progress===]
>>> print summary_statistics
bad_row_count = 0
geometric_mean = 5.67257514519
good_row_count = 7
maximum = 30.0
mean = 8.57142857143
mean_confidence_lower = 1.27708372993
mean_confidence_upper = 15.8657734129
minimum = 2.0
non_positive_weight_count = 0
positive_weight_count = 7
standard_deviation = 9.84644001416
total_weight = 7.0
variance = 96.9523809524
Given a frame with column 'a' and column 'w' as weights accessed by a Frame object 'my_frame':
>>> data = [[2,1.7],[3,0.5],[3,1.2],[5,0.8],[7,1.1],[10,0.8],[30,0.1]]
>>> schema = [('a', int), ('w', float)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Inspect my_frame
>>> my_frame.inspect()
[#] a w
============
[0] 2 1.7
[1] 3 0.5
[2] 3 1.2
[3] 5 0.8
[4] 7 1.1
[5] 10 0.8
[6] 30 0.1
Compute and return summary statistics for values in column 'a' with weights 'w':
>>> summary_statistics = my_frame.column_summary_statistics('a', weights_column='w')
[===Job Progress===]
>>> print summary_statistics
bad_row_count = 0
geometric_mean = 4.03968288152
good_row_count = 7
maximum = 30.0
mean = 5.03225806452
mean_confidence_lower = 1.42847242276
mean_confidence_upper = 8.63604370627
minimum = 2.0
non_positive_weight_count = 0
positive_weight_count = 7
standard_deviation = 4.57824177679
total_weight = 6.2
variance = 20.9602977667
"""
return ColumnSummaryStatistics(self._scala.columnSummaryStatistics(data_column,
self._tc.jutils.convert.to_scala_option(weights_column),
use_popultion_variance))
def copy(self, columns=None, where=None)
New frame with copied columns.
columns | (str, List[str], or dictionary(str,str)): | If not None, the copy will only include the columns specified. If dict, the string pairs represent a column renaming { source_column_name : destination_column_name } |
where | (UDF): | Optionally provide a where function. If not None, only those rows for which the UDF evaluates to True will be copied. |
Returns | (Frame): | New Frame object. |
Copies specified columns into a new Frame object, optionally renaming them and/or filtering them. Useful for frame query.
Consider the following frame of employee names, age, and years of service:
>>> frame.inspect()
[#] name age years
=========================
[0] Thurston 64 26
[1] Judy 44 14
[2] Emily 37 5
[3] Frank 50 18
[4] Joe 43 11
[5] Ruth 52 21
>>> frame.schema
[('name', <type 'str'>), ('age', <type 'int'>), ('years', <type 'int'>)]
To create a duplicate copy of the frame, use the copy operation with no parameters:
>>> duplicate = frame.copy()
[===Job Progress===]
>>> duplicate.inspect()
[#] name age years
=========================
[0] Thurston 64 26
[1] Judy 44 14
[2] Emily 37 5
[3] Frank 50 18
[4] Joe 43 11
[5] Ruth 52 21
Using the copy operation, we can also limit the new frame to just include the 'name' column:
>>> names = frame.copy("name")
[===Job Progress===]
>>> names.inspect()
[#] name
=============
[0] Thurston
[1] Judy
[2] Emily
[3] Frank
[4] Joe
[5] Ruth
We could also include a UDF to filter the data that is included in the new frame, and also provide a dictionary to rename the column(s) in the new frame. Here we will use copy to create a frame of names for the employees that have over 20 years of service and also rename of the 'name' column to 'first_name':
>>> names = frame.copy({"name" : "first_name"}, lambda row: row.years > 20)
[===Job Progress===]
>>> names.inspect()
[#] first_name
===============
[0] Thurston
[1] Ruth
def copy(self, columns=None, where=None):
"""
New frame with copied columns.
Parameters
----------
:param columns: (str, List[str], or dictionary(str,str)) If not None, the copy will only include the
columns specified. If dict, the string pairs represent a column renaming
{ source_column_name : destination_column_name }
:param where: (UDF) Optionally provide a where function. If not None, only those rows for which the UDF
evaluates to True will be copied.
:return: (Frame) New Frame object.
Copies specified columns into a new Frame object, optionally renaming them and/or filtering them.
Useful for frame query.
Examples
--------
Consider the following frame of employee names, age, and years of service:
>>> frame.inspect()
[#] name age years
=========================
[0] Thurston 64 26
[1] Judy 44 14
[2] Emily 37 5
[3] Frank 50 18
[4] Joe 43 11
[5] Ruth 52 21
>>> frame.schema
[('name', <type 'str'>), ('age', <type 'int'>), ('years', <type 'int'>)]
To create a duplicate copy of the frame, use the copy operation with no parameters:
>>> duplicate = frame.copy()
[===Job Progress===]
>>> duplicate.inspect()
[#] name age years
=========================
[0] Thurston 64 26
[1] Judy 44 14
[2] Emily 37 5
[3] Frank 50 18
[4] Joe 43 11
[5] Ruth 52 21
Using the copy operation, we can also limit the new frame to just include the 'name' column:
>>> names = frame.copy("name")
[===Job Progress===]
>>> names.inspect()
[#] name
=============
[0] Thurston
[1] Judy
[2] Emily
[3] Frank
[4] Joe
[5] Ruth
We could also include a UDF to filter the data that is included in the new frame, and also provide
a dictionary to rename the column(s) in the new frame. Here we will use copy to create a frame of
names for the employees that have over 20 years of service and also rename of the 'name' column to
'first_name':
>>> names = frame.copy({"name" : "first_name"}, lambda row: row.years > 20)
[===Job Progress===]
>>> names.inspect()
[#] first_name
===============
[0] Thurston
[1] Ruth
"""
new_rdd = self._python.rdd
if where is not None and not isinstance(where, types.FunctionType):
raise ValueError("Unsupported type for 'where' parameter. Must be a function or None, but is: {0}".format(type(where)))
if isinstance(columns, str):
columns = [columns]
if isinstance(columns, list):
column_indices = [i for i, column in enumerate(self._python.schema) if column[0] in columns]
elif isinstance(columns, dict):
column_indices = [i for i, column in enumerate(self._python.schema) if column[0] in columns.keys()]
elif columns is None:
column_indices = xrange(0, len(self._python.schema))
else:
raise ValueError("Unsupported type for 'columns' parameter. Expected str, list, dict, or None, but was: {0}".format(type(columns)))
if where is not None:
# If a udf is provided, apply that function and apply the new schema
row = Row(self._python.schema)
def copy_func(r):
row._set_data(r)
return where(row)
new_rdd = self._python.rdd.filter(lambda r: copy_func(r))
if len(column_indices) < len(self._python.schema):
# Map rows to only include the specified columns
row = Row(self._python.schema)
def map_func(r):
row._set_data(r)
return list(row[i] for i in column_indices)
new_rdd = new_rdd.map(lambda r: map_func(r))
new_schema = list(self._python.schema[i] for i in column_indices)
# If columns are being renamed through a dictionary, alter the schema
if (isinstance(columns, dict)):
renamed_schema = []
for column in new_schema:
if columns.has_key(column[0]):
new_name = columns[column[0]]
renamed_schema.append((new_name, column[1]))
new_schema = renamed_schema
# return new frame with the filtered rdd and new schema
return self._tc.frame.create(new_rdd, new_schema)
def correlation(self, column_a, column_b)
Calculate correlation for two columns of current frame.
column_a | (str): | The name of the column from which to compute the correlation. |
column_b | (str): | The name of the column from which to compute the correlation. |
Returns | (float): | Pearson correlation coefficient of the two columns. |
This method applies only to columns containing numerical data.
Consider Frame my_frame, which contains the data
>>> my_frame.inspect()
[#] idnum x1 x2 x3 x4
===============================
[0] 0 1.0 4.0 0.0 -1.0
[1] 1 2.0 3.0 0.0 -1.0
[2] 2 3.0 2.0 1.0 -1.0
[3] 3 4.0 1.0 2.0 -1.0
[4] 4 5.0 0.0 2.0 -1.0
my_frame.correlation computes the common correlation coefficient (Pearson's) on the pair of columns provided. In this example, the idnum and most of the columns have trivial correlations: -1, 0, or +1. Column x3 provides a contrasting coefficient of 3 / sqrt(10) = 0.948683298051.
>>> my_frame.correlation("x1", "x2")
-0.9999999999999998
>>> my_frame.correlation("x1", "x4")
nan
>>> my_frame.correlation("x2", "x3")
-0.9486832980505138
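The x1/x3 coefficient of 3 / sqrt(10) can be verified by computing Pearson's correlation from its definition. A minimal plain-Python sketch (independent of sparktk), using the same column values:

    import math

    # Pearson correlation: covariance of deviations divided by the product of the
    # standard deviations (the (n-1) factors cancel, so raw sums are used here).
    def pearson(xs, ys):
        n = len(xs)
        mean_x = sum(xs) / float(n)
        mean_y = sum(ys) / float(n)
        cov = sum((x - mean_x) * (y - mean_y) for x, y in zip(xs, ys))
        var_x = sum((x - mean_x) ** 2 for x in xs)
        var_y = sum((y - mean_y) ** 2 for y in ys)
        return cov / math.sqrt(var_x * var_y)

    x1 = [1.0, 2.0, 3.0, 4.0, 5.0]
    x3 = [0.0, 0.0, 1.0, 2.0, 2.0]
    print(pearson(x1, x3))          # ~0.948683298051, i.e. 3 / sqrt(10)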
def correlation(self, column_a, column_b):
"""
Calculate correlation for two columns of current frame.
Parameters
----------
:param column_a: (str) The name of the column from which to compute the correlation.
:param column_b: (str) The name of the column from which to compute the correlation.
:return: (float) Pearson correlation coefficient of the two columns.
Notes
-----
This method applies only to columns containing numerical data.
Examples
--------
Consider Frame *my_frame*, which contains the data
>>> my_frame.inspect()
[#] idnum x1 x2 x3 x4
===============================
[0] 0 1.0 4.0 0.0 -1.0
[1] 1 2.0 3.0 0.0 -1.0
[2] 2 3.0 2.0 1.0 -1.0
[3] 3 4.0 1.0 2.0 -1.0
[4] 4 5.0 0.0 2.0 -1.0
my_frame.correlation computes the common correlation coefficient (Pearson's) on the pair
of columns provided.
In this example, the *idnum* and most of the columns have trivial correlations: -1, 0, or +1.
Column *x3* provides a contrasting coefficient of 3 / sqrt(10) = 0.948683298051.
>>> my_frame.correlation("x1", "x2")
-0.9999999999999998
>>> my_frame.correlation("x1", "x4")
nan
>>> my_frame.correlation("x2", "x3")
-0.9486832980505138
"""
return self._scala.correlation(column_a, column_b)
def correlation_matrix(self, data_column_names)
Calculate correlation matrix for two or more columns.
data_column_names | (List[str]): | The names of the columns from which to compute the matrix. |
Returns | (Frame): | A Frame with the matrix of the correlation values for the columns. |
This method applies only to columns containing numerical data.
Consider Frame my_frame, which contains the data
>>> my_frame.inspect()
[#] idnum x1 x2 x3 x4
===============================
[0] 0 1.0 4.0 0.0 -1.0
[1] 1 2.0 3.0 0.0 -1.0
[2] 2 3.0 2.0 1.0 -1.0
[3] 3 4.0 1.0 2.0 -1.0
[4] 4 5.0 0.0 2.0 -1.0
my_frame.correlation_matrix computes the common correlation coefficient (Pearson's) on each pair of columns in the user-provided list. In this example, the idnum and most of the columns have trivial correlations: -1, 0, or +1. Column x3 provides a contrasting coefficient of 3 / sqrt(10) = 0.948683298051
>>> corr_matrix = my_frame.correlation_matrix(my_frame.column_names)
[===Job Progress===]
The resulting table (specifying all columns) is:
>>> corr_matrix.inspect()
[#] idnum x1 x2 x3 x4
==========================================================================
[0] 1.0 1.0 -1.0 0.948683298051 nan
[1] 1.0 1.0 -1.0 0.948683298051 nan
[2] -1.0 -1.0 1.0 -0.948683298051 nan
[3] 0.948683298051 0.948683298051 -0.948683298051 1.0 nan
[4] nan nan nan nan 1.0
def correlation_matrix(self, data_column_names):
"""
Calculate correlation matrix for two or more columns.
Parameters
----------
:param data_column_names: (List[str]) The names of the columns from which to compute the matrix.
:return: (Frame) A Frame with the matrix of the correlation values for the columns.
Notes
-----
This method applies only to columns containing numerical data.
Examples
--------
Consider Frame *my_frame*, which contains the data
>>> my_frame.inspect()
[#] idnum x1 x2 x3 x4
===============================
[0] 0 1.0 4.0 0.0 -1.0
[1] 1 2.0 3.0 0.0 -1.0
[2] 2 3.0 2.0 1.0 -1.0
[3] 3 4.0 1.0 2.0 -1.0
[4] 4 5.0 0.0 2.0 -1.0
my_frame.correlation_matrix computes the common correlation coefficient (Pearson's) on each pair
of columns in the user-provided list.
In this example, the *idnum* and most of the columns have trivial correlations: -1, 0, or +1.
Column *x3* provides a contrasting coefficient of 3 / sqrt(10) = 0.948683298051
>>> corr_matrix = my_frame.correlation_matrix(my_frame.column_names)
[===Job Progress===]
The resulting table (specifying all columns) is:
>>> corr_matrix.inspect()
[#] idnum x1 x2 x3 x4
==========================================================================
[0] 1.0 1.0 -1.0 0.948683298051 nan
[1] 1.0 1.0 -1.0 0.948683298051 nan
[2] -1.0 -1.0 1.0 -0.948683298051 nan
[3] 0.948683298051 0.948683298051 -0.948683298051 1.0 nan
[4] nan nan nan nan 1.0
"""
from sparktk.frame.frame import Frame
return Frame(self._tc,
self._scala.correlationMatrix(self._tc.jutils.convert.to_scala_list_string(data_column_names)))
def count(self, where=None)
Counts all rows or all qualified rows.
where | (UDF): | Optional function which evaluates a row to a boolean to determine if it should be counted |
Returns | (int): | Number of rows counted |
Counts all rows or all rows which meet criteria specified by a UDF predicate.
>>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
... ['Susan',33,3,'555-0202'],
... ['Thurston',65,26,'555-4510'],
... ['Judy',44,14,'555-2183']],
... schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])
>>> frame.inspect()
[#] name age tenure phone
====================================
[0] Fred 39 16 555-1234
[1] Susan 33 3 555-0202
[2] Thurston 65 26 555-4510
[3] Judy 44 14 555-2183
>>> frame.count()
4
>>> frame.count(lambda row: row.age > 35)
3
def count(self, where=None):
"""
Counts all rows or all qualified rows.
Parameters
----------
:param where: (UDF) Optional function which evaluates a row to a boolean to determine if it should be counted
:return: (int) Number of rows counted
Counts all rows or all rows which meet criteria specified by a UDF predicate.
Examples
--------
>>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
... ['Susan',33,3,'555-0202'],
... ['Thurston',65,26,'555-4510'],
... ['Judy',44,14,'555-2183']],
... schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])
>>> frame.inspect()
[#] name age tenure phone
====================================
[0] Fred 39 16 555-1234
[1] Susan 33 3 555-0202
[2] Thurston 65 26 555-4510
[3] Judy 44 14 555-2183
>>> frame.count()
4
>>> frame.count(lambda row: row.age > 35)
3
"""
if where:
row = Row(self.schema)
def count_where(r):
row._set_data(r)
return where(row)
return self._python.rdd.filter(lambda r: count_where(r)).count()
else:
if self._is_scala:
return int(self._scala.rowCount())
return self.rdd.count()
def covariance(self, column_a, column_b)
Calculate covariance for exactly two columns.
column_a | (str): | The name of the column from which to compute the covariance. |
column_b | (str): | The name of the column from which to compute the covariance. |
Returns | (float): | Covariance of the two columns. |
This method applies only to columns containing numerical data.
Consider Frame my_frame, which contains the data
>>> my_frame.inspect()
[#] idnum x1 x2 x3 x4
===============================
[0] 0 1.0 4.0 0.0 -1.0
[1] 1 2.0 3.0 0.0 -1.0
[2] 2 3.0 2.0 1.0 -1.0
[3] 3 4.0 1.0 2.0 -1.0
[4] 4 5.0 0.0 2.0 -1.0
my_frame.covariance computes the covariance on the pair of columns provided.
>>> my_frame.covariance("x1", "x2")
-2.5
>>> my_frame.covariance("x1", "x4")
0.0
>>> my_frame.covariance("x2", "x3")
-1.5
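The documented values are consistent with the sample covariance (sum of deviation products divided by n - 1). A minimal plain-Python check of the x1/x2 value, independent of sparktk:

    # Sample covariance: sum of products of deviations from the means, divided by n - 1.
    def sample_covariance(xs, ys):
        n = len(xs)
        mean_x = sum(xs) / float(n)
        mean_y = sum(ys) / float(n)
        return sum((x - mean_x) * (y - mean_y) for x, y in zip(xs, ys)) / (n - 1)

    x1 = [1.0, 2.0, 3.0, 4.0, 5.0]
    x2 = [4.0, 3.0, 2.0, 1.0, 0.0]
    print(sample_covariance(x1, x2))     # -2.5, matching my_frame.covariance("x1", "x2")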
def covariance(self, column_a, column_b):
"""
Calculate covariance for exactly two columns.
Parameters
----------
:param column_a: (str) The name of the column from which to compute the covariance.
:param column_b: (str) The name of the column from which to compute the covariance.
:return: (float) Covariance of the two columns.
Notes
-----
This method applies only to columns containing numerical data.
Examples
--------
Consider Frame *my_frame*, which contains the data
>>> my_frame.inspect()
[#] idnum x1 x2 x3 x4
===============================
[0] 0 1.0 4.0 0.0 -1.0
[1] 1 2.0 3.0 0.0 -1.0
[2] 2 3.0 2.0 1.0 -1.0
[3] 3 4.0 1.0 2.0 -1.0
[4] 4 5.0 0.0 2.0 -1.0
my_frame.covariance computes the covariance on the pair of columns provided.
>>> my_frame.covariance("x1", "x2")
-2.5
>>> my_frame.covariance("x1", "x4")
0.0
>>> my_frame.covariance("x2", "x3")
-1.5
"""
return self._scala.covariance(column_a, column_b)
def covariance_matrix(self, data_column_names)
Calculate covariance matrix for two or more columns.
data_column_names | (List[str]): | The names of the column from which to compute the matrix. Names should refer to a single column of type vector, or two or more columns of numeric scalars. |
Returns | (Frame): | A matrix with the covariance values for the columns. |
This function applies only to columns containing numerical data.
Consider Frame my_frame, which contains the data
>>> my_frame.inspect()
[#] idnum x1 x2 x3 x4
===============================
[0] 0 1.0 4.0 0.0 -1.0
[1] 1 2.0 3.0 0.0 -1.0
[2] 2 3.0 2.0 1.0 -1.0
[3] 3 4.0 1.0 2.0 -1.0
[4] 4 5.0 0.0 2.0 -1.0
my_frame.covariance_matrix computes the covariance on each pair of columns in the user-provided list.
>>> cov_matrix = my_frame.covariance_matrix(my_frame.column_names)
[===Job Progress===]
The resulting table (specifying all columns) is:
>>> cov_matrix.inspect()
[#] idnum x1 x2 x3 x4
=================================
[0] 2.5 2.5 -2.5 1.5 0.0
[1] 2.5 2.5 -2.5 1.5 0.0
[2] -2.5 -2.5 2.5 -1.5 0.0
[3] 1.5 1.5 -1.5 1.0 0.0
[4] 0.0 0.0 0.0 0.0 0.0
def covariance_matrix(self, data_column_names):
"""
Calculate covariance matrix for two or more columns.
Parameters
----------
:param data_column_names: (List[str]) The names of the column from which to compute the matrix.
Names should refer to a single column of type vector, or two or more
columns of numeric scalars.
:return: (Frame) A matrix with the covariance values for the columns.
Notes
-----
This function applies only to columns containing numerical data.
Examples
--------
Consider Frame *my_frame*, which contains the data
>>> my_frame.inspect()
[#] idnum x1 x2 x3 x4
===============================
[0] 0 1.0 4.0 0.0 -1.0
[1] 1 2.0 3.0 0.0 -1.0
[2] 2 3.0 2.0 1.0 -1.0
[3] 3 4.0 1.0 2.0 -1.0
[4] 4 5.0 0.0 2.0 -1.0
my_frame.covariance_matrix computes the covariance on each pair of columns in the user-provided list.
>>> cov_matrix = my_frame.covariance_matrix(my_frame.column_names)
[===Job Progress===]
The resulting table (specifying all columns) is:
>>> cov_matrix.inspect()
[#] idnum x1 x2 x3 x4
=================================
[0] 2.5 2.5 -2.5 1.5 0.0
[1] 2.5 2.5 -2.5 1.5 0.0
[2] -2.5 -2.5 2.5 -1.5 0.0
[3] 1.5 1.5 -1.5 1.0 0.0
[4] 0.0 0.0 0.0 0.0 0.0
"""
from sparktk.frame.frame import Frame
return Frame(self._tc,
self._scala.covarianceMatrix(self._tc.jutils.convert.to_scala_list_string(data_column_names)))
def cumulative_percent(self, sample_col)
Add column to frame with cumulative percent.
sample_col | (str): | The name of the column from which to compute the cumulative percent. |
A cumulative percent sum is computed by sequentially stepping through the rows, observing the column values and keeping track of the current percentage of the total sum accounted for at the current value.
This method applies only to columns containing numerical data. Although this method will execute for columns containing negative values, the interpretation of the result will change (for example, negative percentages).
Consider Frame my_frame accessing a frame that contains a single column named obs:
>>> my_frame.inspect()
[#] obs
========
[0] 0
[1] 1
[2] 2
[3] 0
[4] 1
[5] 2
The cumulative percent sum for column obs is obtained by:
>>> my_frame.cumulative_percent('obs')
[===Job Progress===]
The Frame my_frame now contains two columns, obs and obs_cumulative_percent. They contain the original data and the cumulative percent sum, respectively:
>>> my_frame.inspect()
[#] obs obs_cumulative_percent
================================
[0] 0 0.0
[1] 1 0.166666666667
[2] 2 0.5
[3] 0 0.5
[4] 1 0.666666666667
[5] 2 1.0
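Each value in the new column is just the running sum of the column divided by the column total. A short sketch of that computation in plain Python (not the sparktk implementation), using the same obs values:

    obs = [0, 1, 2, 0, 1, 2]
    total = float(sum(obs))          # 6.0
    running = 0
    cumulative_percent = []
    for value in obs:
        running += value
        cumulative_percent.append(running / total)
    print(cumulative_percent)
    # [0.0, 0.1666..., 0.5, 0.5, 0.6666..., 1.0], matching the obs_cumulative_percent column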
def cumulative_percent(self, sample_col):
"""
Add column to frame with cumulative percent.
Parameters
----------
:param sample_col: (str) The name of the column from which to compute the cumulative percent.
A cumulative percent sum is computed by sequentially stepping through the rows,
observing the column values and keeping track of the current percentage of the
total sum accounted for at the current value.
Notes
-----
This method applies only to columns containing numerical data.
Although this method will execute for columns containing negative
values, the interpretation of the result will change (for example,
negative percentages).
Examples
--------
Consider Frame *my_frame* accessing a frame that contains a single
column named *obs*:
>>> my_frame.inspect()
[#] obs
========
[0] 0
[1] 1
[2] 2
[3] 0
[4] 1
[5] 2
The cumulative percent sum for column *obs* is obtained by:
>>> my_frame.cumulative_percent('obs')
[===Job Progress===]
The Frame *my_frame* now contains two columns, *obs* and *obs_cumulative_percent*.
They contain the original data and the cumulative percent sum,
respectively:
>>> my_frame.inspect()
[#] obs obs_cumulative_percent
================================
[0] 0 0.0
[1] 1 0.166666666667
[2] 2 0.5
[3] 0 0.5
[4] 1 0.666666666667
[5] 2 1.0
"""
self._scala.cumulativePercent(sample_col)
def cumulative_sum(self, sample_col)
Add column to frame with cumulative sum.
sample_col | (str): | The name of the column from which to compute the cumulative sum. |
A cumulative sum is computed by sequentially stepping through the rows, observing the column values and keeping track of the cumulative sum for each value.
This method applies only to columns containing numerical data.
Consider Frame my_frame, which accesses a frame that contains a single column named obs:
>>> my_frame.inspect()
[#] obs
========
[0] 0
[1] 1
[2] 2
[3] 0
[4] 1
[5] 2
The cumulative sum for column obs is obtained by:
>>> my_frame.cumulative_sum('obs')
[===Job Progress===]
The Frame my_frame accesses the original frame, which now contains two columns: obs, which contains the original column values, and obs_cumulative_sum, which contains the cumulative sum:
>>> my_frame.inspect()
[#] obs obs_cumulative_sum
============================
[0] 0 0.0
[1] 1 1.0
[2] 2 3.0
[3] 0 3.0
[4] 1 4.0
[5] 2 6.0
def cumulative_sum(self, sample_col):
"""
Add column to frame with cumulative sum.
Parameters
----------
:param sample_col: (str) The name of the column from which to compute the cumulative sum.
A cumulative sum is computed by sequentially stepping through the rows,
observing the column values and keeping track of the cumulative sum for each value.
Notes
-----
This method applies only to columns containing numerical data.
Examples
--------
Consider Frame *my_frame*, which accesses a frame that contains a single
column named *obs*:
>>> my_frame.inspect()
[#] obs
========
[0] 0
[1] 1
[2] 2
[3] 0
[4] 1
[5] 2
The cumulative sum for column *obs* is obtained by:
>>> my_frame.cumulative_sum('obs')
[===Job Progress===]
The Frame *my_frame* accesses the original frame, which now contains two
columns: *obs*, which contains the original column values, and
*obs_cumulative_sum*, which contains the cumulative sum:
>>> my_frame.inspect()
[#] obs obs_cumulative_sum
============================
[0] 0 0.0
[1] 1 1.0
[2] 2 3.0
[3] 0 3.0
[4] 1 4.0
[5] 2 6.0
"""
self._scala.cumulativeSum(sample_col)
def dot_product(self, left_column_names, right_column_names, dot_product_column_name, default_left_values=None, default_right_values=None)
Calculate dot product for each row in current frame.
left_column_names | (List[str]): | Names of columns used to create the left vector (A) for each row. Names should refer to a single column of type vector, or two or more columns of numeric scalars. |
right_column_names | (List[str]): | Names of columns used to create right vector (B) for each row. Names should refer to a single column of type vector, or two or more columns of numeric scalars. |
dot_product_column_name | (str): | Name of column used to store the dot product. |
default_left_values | (Optional[List[float]]): | Default values used to substitute null values in the left vector. Default is None. |
default_right_values | (Optional[List[float]]): | Default values used to substitute null values in the right vector. Default is None. |
Returns | (Frame): | returns a frame with the given "dot_product" column name |
Calculate the dot product for each row in a frame using values from two equal-length sequences of columns.
Dot product is computed by the following formula:
The dot product of two vectors :math:`A=[a_1, a_2, ..., a_n]` and :math:`B=[b_1, b_2, ..., b_n]` is :math:`a_1*b_1 + a_2*b_2 + ... + a_n*b_n`.
The dot product for each row is stored in a new column in the existing frame.
- If default_left_values or default_right_values are not specified, any null values will be replaced by zeros.
- This method applies only to columns containing numerical data.
>>> data = [[1, 0.2, -2, 5], [2, 0.4, -1, 6], [3, 0.6, 0, 7], [4, 0.8, 1, 8]]
>>> schema = [('col_0', int), ('col_1', float),('col_2', int) ,('col_3', int)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Calculate the dot product for a sequence of columns in Frame object my_frame:
>>> my_frame.inspect()
[#] col_0 col_1 col_2 col_3
===============================
[0] 1 0.2 -2 5
[1] 2 0.4 -1 6
[2] 3 0.6 0 7
[3] 4 0.8 1 8
Modify the frame by computing the dot product for a sequence of columns:
>>> my_frame.dot_product(['col_0','col_1'], ['col_2', 'col_3'], 'dot_product')
[===Job Progress===]
>>> my_frame.inspect()
[#] col_0 col_1 col_2 col_3 dot_product
============================================
[0] 1 0.2 -2 5 -1.0
[1] 2 0.4 -1 6 0.4
[2] 3 0.6 0 7 4.2
[3] 4 0.8 1 8 10.4
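Each dot_product value above is simply the per-row sum of products of the left columns (col_0, col_1) with the right columns (col_2, col_3). A minimal plain-Python check (not the sparktk implementation):

    # For each row, left vector A = (col_0, col_1), right vector B = (col_2, col_3).
    rows = [[1, 0.2, -2, 5], [2, 0.4, -1, 6], [3, 0.6, 0, 7], [4, 0.8, 1, 8]]
    for row in rows:
        left, right = row[0:2], row[2:4]
        dot = sum(a * b for a, b in zip(left, right))
        print(dot)      # approximately -1.0, 0.4, 4.2, 10.4 (up to floating-point rounding)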
def dot_product(self, left_column_names, right_column_names, dot_product_column_name, default_left_values=None, default_right_values=None):
"""
Calculate dot product for each row in current frame.
Parameters
----------
:param left_column_names: (List[str]) Names of columns used to create the left vector (A) for each row.
Names should refer to a single column of type vector, or two or more columns of numeric scalars.
:param right_column_names: (List[str]) Names of columns used to create right vector (B) for each row.
Names should refer to a single column of type vector, or two or more columns of numeric scalars.
:param dot_product_column_name: (str) Name of column used to store the dot product.
:param default_left_values: (Optional[List[float]]) Default values used to substitute null values in the left vector. Default is None.
:param default_right_values: (Optional[List[float]]) Default values used to substitute null values in the right vector. Default is None.
:return: (Frame) returns a frame with the given "dot_product" column name
Calculate the dot product for each row in a frame using values from two equal-length sequences of columns.
Dot product is computed by the following formula:
The dot product of two vectors :math:`A=[a_1, a_2, ..., a_n]` and :math:`B =[b_1, b_2, ..., b_n]` is :math:`a_1*b_1 + a_2*b_2 + ...+ a_n*b_n`.
The dot product for each row is stored in a new column in the existing frame.
Notes
-----
* If default_left_values or default_right_values are not specified, any null values will be replaced by zeros.
* This method applies only to columns containing numerical data.
Examples
--------
>>> data = [[1, 0.2, -2, 5], [2, 0.4, -1, 6], [3, 0.6, 0, 7], [4, 0.8, 1, 8]]
>>> schema = [('col_0', int), ('col_1', float),('col_2', int) ,('col_3', int)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Calculate the dot product for a sequence of columns in Frame object *my_frame*:
>>> my_frame.inspect()
[#] col_0 col_1 col_2 col_3
===============================
[0] 1 0.2 -2 5
[1] 2 0.4 -1 6
[2] 3 0.6 0 7
[3] 4 0.8 1 8
Modify the frame by computing the dot product for a sequence of columns:
>>> my_frame.dot_product(['col_0','col_1'], ['col_2', 'col_3'], 'dot_product')
[===Job Progress===]
>>> my_frame.inspect()
[#] col_0 col_1 col_2 col_3 dot_product
============================================
[0] 1 0.2 -2 5 -1.0
[1] 2 0.4 -1 6 0.4
[2] 3 0.6 0 7 4.2
[3] 4 0.8 1 8 10.4
"""
if not isinstance(left_column_names, list):
left_column_names = [left_column_names]
if not isinstance(right_column_names, list):
right_column_names = [right_column_names]
self._scala.dotProduct(self._tc.jutils.convert.to_scala_list_string(left_column_names),
self._tc.jutils.convert.to_scala_list_string(right_column_names),
dot_product_column_name,
self._tc.jutils.convert.to_scala_option_list_double(default_left_values),
self._tc.jutils.convert.to_scala_option_list_double(default_right_values))
def drop_columns(self, columns)
Drops columns from the frame
columns | (str or List[str]): | names of the columns to drop |
For this example, the Frame object frame accesses a frame with 4 columns (column_a, column_b, column_c and column_d) and drops 2 columns (column_b and column_d) using drop_columns.
>>> print frame.schema
[('column_a', <type 'str'>), ('column_b', <type 'int'>), ('column_c', <type 'str'>), ('column_d', <type 'int'>)]
Eliminate columns column_b and column_d:
>>> frame.drop_columns(["column_b", "column_d"])
>>> print frame.schema
[('column_a', <type 'str'>), ('column_c', <type 'str'>)]
Now the frame only has the columns column_a and column_c.
For further examples, see :ref:`example_frame.drop_columns`.
def drop_columns(self, columns):
"""
Drops columns from the frame
Parameters
----------
:param columns: (str or List[str]) names of the columns to drop
Examples
--------
For this example, the Frame object *frame* accesses a frame with 4 columns
(*column_a*, *column_b*, *column_c* and *column_d*) and drops 2 columns (*column_b* and *column_d*) using drop_columns.
>>> print frame.schema
[('column_a', <type 'str'>), ('column_b', <type 'int'>), ('column_c', <type 'str'>), ('column_d', <type 'int'>)]
Eliminate columns *column_b* and *column_d*:
>>> frame.drop_columns(["column_b", "column_d"])
>>> print frame.schema
[('column_a', <type 'str'>), ('column_c', <type 'str'>)]
Now the frame only has the columns *column_a* and *column_c*.
For further examples, see :ref:`example_frame.drop_columns`.
"""
if isinstance(columns, basestring):
columns = [columns]
if self._is_scala:
self._scala.dropColumns(self._tc.jutils.convert.to_scala_vector_string(columns))
else:
victim_indices = get_indices_for_selected_columns(self.schema, columns)
survivor_indices = [i for i in xrange(len(self.schema)) if i not in victim_indices]
filtered_schema = [self.schema[i] for i in survivor_indices]
def filter_fields(row):
return [row[i] for i in survivor_indices]
filtered_rdd = self.rdd.map(filter_fields)
self._frame = PythonFrame(filtered_rdd, filtered_schema)
def drop_duplicates(self, unique_columns=None)
Modify the current frame, removing duplicate rows.
unique_columns | (Optional[List[str] or str]): | Column name(s) to identify duplicates. Default is the entire row is compared. |
Remove data rows which are the same as other rows. The entire row can be checked for duplication, or the search for duplicates can be limited to one or more columns. This modifies the current frame.
Given a frame with data:
>>> frame.inspect()
[#] a b c
===============
[0] 200 4 25
[1] 200 5 25
[2] 200 4 25
[3] 200 5 35
[4] 200 6 25
[5] 200 8 35
[6] 200 4 45
[7] 200 4 25
[8] 200 5 25
[9] 201 4 25
Remove any rows that are identical to a previous row. The result is a frame of unique rows. Note that row order may change.
>>> frame.drop_duplicates()
[===Job Progress===]
>>> frame.inspect()
[#] a b c
===============
[0] 200 8 35
[1] 200 6 25
[2] 200 5 35
[3] 200 4 45
[4] 200 4 25
[5] 200 5 25
[6] 201 4 25
Now remove any rows that have the same data in columns a and c as a previously checked row:
>>> frame.drop_duplicates([ "a", "c"])
[===Job Progress===]
The result is a frame with unique values for the combination of columns a and c.
>>> frame.inspect()
[#] a b c
===============
[0] 201 4 25
[1] 200 4 45
[2] 200 6 25
[3] 200 8 35
def drop_duplicates(self, unique_columns=None):
"""
Modify the current frame, removing duplicate rows.
Parameters
----------
:param unique_columns: (Optional[List[str] or str]) Column name(s) to identify duplicates. Default is the entire
row is compared.
Remove data rows which are the same as other rows.
The entire row can be checked for duplication, or the search for duplicates can be limited to one or more columns.
This modifies the current frame.
Examples
--------
Given a frame with data:
>>> frame.inspect()
[#] a b c
===============
[0] 200 4 25
[1] 200 5 25
[2] 200 4 25
[3] 200 5 35
[4] 200 6 25
[5] 200 8 35
[6] 200 4 45
[7] 200 4 25
[8] 200 5 25
[9] 201 4 25
Remove any rows that are identical to a previous row.
The result is a frame of unique rows.
Note that row order may change.
>>> frame.drop_duplicates()
[===Job Progress===]
>>> frame.inspect()
[#] a b c
===============
[0] 200 8 35
[1] 200 6 25
[2] 200 5 35
[3] 200 4 45
[4] 200 4 25
[5] 200 5 25
[6] 201 4 25
Now remove any rows that have the same data in columns *a* and
*c* as a previously checked row:
>>> frame.drop_duplicates([ "a", "c"])
[===Job Progress===]
The result is a frame with unique values for the combination of columns *a*
and *c*.
>>> frame.inspect()
[#] a b c
===============
[0] 201 4 25
[1] 200 4 45
[2] 200 6 25
[3] 200 8 35
"""
if unique_columns is not None and not isinstance(unique_columns, list):
unique_columns = [unique_columns]
if isinstance(unique_columns, list):
unique_columns = self._tc.jutils.convert.to_scala_vector_string(unique_columns)
self._scala.dropDuplicates(self._tc.jutils.convert.to_scala_option(unique_columns))
def drop_rows(self, predicate)
Erase any row in the current frame which qualifies.
predicate | (UDF): | Function which evaluates a row to a boolean; rows that answer True are dropped from the frame. |
>>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
... ['Susan',33,3,'555-0202'],
... ['Thurston',65,26,'555-4510'],
... ['Judy',44,14,'555-2183']],
... schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])
>>> frame.inspect()
[#] name age tenure phone
====================================
[0] Fred 39 16 555-1234
[1] Susan 33 3 555-0202
[2] Thurston 65 26 555-4510
[3] Judy 44 14 555-2183
>>> frame.drop_rows(lambda row: row.name[-1] == 'n') # drop people whose name ends in 'n'
>>> frame.inspect()
[#] name age tenure phone
================================
[0] Fred 39 16 555-1234
[1] Judy 44 14 555-2183
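A predicate may also combine several columns; as an illustrative sketch (not part of the original example set):
>>> frame.drop_rows(lambda row: row.age > 40 and row.tenure < 20)  # would drop only Judy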
More information on a |UDF| can be found at :doc:`/ds_apir`.
def drop_rows(self, predicate):
"""
Erase any row in the current frame which qualifies.
Parameters
----------
:param predicate: (UDF) Function which evaluates a row to a boolean; rows that answer True are dropped from
the frame.
Examples
--------
>>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
... ['Susan',33,3,'555-0202'],
... ['Thurston',65,26,'555-4510'],
... ['Judy',44,14,'555-2183']],
... schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])
>>> frame.inspect()
[#] name age tenure phone
====================================
[0] Fred 39 16 555-1234
[1] Susan 33 3 555-0202
[2] Thurston 65 26 555-4510
[3] Judy 44 14 555-2183
>>> frame.drop_rows(lambda row: row.name[-1] == 'n') # drop people whose name ends in 'n'
>>> frame.inspect()
[#] name age tenure phone
================================
[0] Fred 39 16 555-1234
[1] Judy 44 14 555-2183
More information on a |UDF| can be found at :doc:`/ds_apir`.
"""
row = Row(self.schema)
def drop_rows_func(r):
row._set_data(r)
return not predicate(row)
self._python.rdd = self._python.rdd.filter(drop_rows_func)
def ecdf(
self, column)
Builds new frame with columns for data and distribution.
column | (str): | The name of the input column containing the sample data. |
Returns | (Frame): | A new Frame containing each distinct value in the sample and its corresponding ECDF value. |
Generates the :term:`empirical cumulative distribution` for the input column.
Consider the following sample data set in frame 'frame' containing several numbers.
>>> frame.inspect()
[#] numbers
============
[0] 1
[1] 3
[2] 1
[3] 0
[4] 2
[5] 1
[6] 4
[7] 3
>>> ecdf_frame = frame.ecdf('numbers')
[===Job Progress===]
>>> ecdf_frame.inspect()
[#] numbers numbers_ecdf
==========================
[0] 0 0.125
[1] 1 0.5
[2] 2 0.625
[3] 3 0.875
[4] 4 1.0
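The ECDF value for a given x is simply the fraction of samples less than or equal to x. As an illustrative cross-check in plain Python (not part of the sparktk API):
>>> data = [1, 3, 1, 0, 2, 1, 4, 3]
>>> [float(sum(1 for d in data if d <= x)) / len(data) for x in sorted(set(data))]
[0.125, 0.5, 0.625, 0.875, 1.0]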
def ecdf(self, column):
"""
Builds new frame with columns for data and distribution.
Parameters
----------
:param column: (str) The name of the input column containing the sample data.
:return: (Frame) A new Frame containing each distinct value in the sample and its corresponding ECDF value.
Generates the :term:`empirical cumulative distribution` for the input column.
Examples
--------
Consider the following sample data set in *frame* 'frame' containing several numbers.
>>> frame.inspect()
[#] numbers
============
[0] 1
[1] 3
[2] 1
[3] 0
[4] 2
[5] 1
[6] 4
[7] 3
>>> ecdf_frame = frame.ecdf('numbers')
[===Job Progress===]
>>> ecdf_frame.inspect()
[#] numbers numbers_ecdf
==========================
[0] 0 0.125
[1] 1 0.5
[2] 2 0.625
[3] 3 0.875
[4] 4 1.0
"""
from sparktk.frame.frame import Frame
return Frame(self._tc, self._scala.ecdf(column))
def entropy(
self, data_column, weights_column=None)
Calculate the Shannon entropy of a column.
data_column | (str): | The column whose entropy is to be calculated. |
weights_column | (Optional[str]): | The column that provides weights (frequencies) for the entropy calculation. Must contain numerical data. Default is using uniform weights of 1 for all items. |
Returns | (float): | Entropy. |
The data column is weighted via the weights column. All data elements of weight <= 0 are excluded from the calculation, as are all data elements whose weight is NaN or infinite. If there are no data elements with a finite weight greater than 0, the entropy is zero.
Consider the following sample data set in frame 'frame' containing several numbers.
>>> frame.inspect()
[#] data weight
=================
[0] 0 1
[1] 1 2
[2] 2 4
[3] 4 8
>>> entropy = frame.entropy("data", "weight")
[===Job Progress===]
>>> "%0.8f" % entropy
'1.13691659'
If we have more choices and weights, the computation is not as simple. An on-line search for "Shannon Entropy" will provide more detail.
Given a frame of coin flips, half heads and half tails, the entropy is simply ln(2):
>>> frame.inspect()
[#] data
=========
[0] H
[1] T
[2] H
[3] T
[4] H
[5] T
[6] H
[7] T
[8] H
[9] T
>>> entropy = frame.entropy("data")
[===Job Progress===]
>>> "%0.8f" % entropy
'0.69314718'
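As an illustrative cross-check in plain Python (not part of the sparktk API), the weighted entropy above is -sum(p * ln(p)), where each p is a weight divided by the total weight:
>>> import math
>>> weights = [1.0, 2.0, 4.0, 8.0]
>>> p = [w / sum(weights) for w in weights]
>>> "%0.8f" % -sum(x * math.log(x) for x in p)
'1.13691659'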
def entropy(self, data_column, weights_column=None):
"""
Calculate the Shannon entropy of a column.
Parameters
----------
:param data_column: (str) The column whose entropy is to be calculated.
:param weights_column: (Optional[str]) The column that provides weights (frequencies) for the entropy calculation.
Must contain numerical data. Default is using uniform weights of 1 for all items.
:return: (float) Entropy.
The data column is weighted via the weights column.
All data elements of weight <= 0 are excluded from the calculation, as are
all data elements whose weight is NaN or infinite.
If there are no data elements with a finite weight greater than 0,
the entropy is zero.
Examples
--------
Consider the following sample data set in *frame* 'frame' containing several numbers.
>>> frame.inspect()
[#] data weight
=================
[0] 0 1
[1] 1 2
[2] 2 4
[3] 4 8
>>> entropy = frame.entropy("data", "weight")
[===Job Progress===]
>>> "%0.8f" % entropy
'1.13691659'
If we have more choices and weights, the computation is not as simple.
An on-line search for "Shannon Entropy" will provide more detail.
Given a frame of coin flips, half heads and half tails, the entropy is simply ln(2):
>>> frame.inspect()
[#] data
=========
[0] H
[1] T
[2] H
[3] T
[4] H
[5] T
[6] H
[7] T
[8] H
[9] T
>>> entropy = frame.entropy("data")
[===Job Progress===]
>>> "%0.8f" % entropy
'0.69314718'
"""
return self._scala.entropy(data_column, self._tc.jutils.convert.to_scala_option(weights_column))
def export_to_csv(
self, file_name, separator=',')
Write current frame to disk as a CSV file
file_name | (str): | file destination |
separator | (str): | string to be used for delimiting the fields |
>>> frame = tc.frame.create([[1, 2, 3], [4, 5, 6]])
>>> frame.export_to_csv("sandbox/export_example.csv")
>>> frame2 = tc.frame.import_csv("sandbox/export_example.csv")
>>> frame2.inspect()
[#] C0 C1 C2
===============
[0] 1 2 3
[1] 4 5 6
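The separator parameter can be used to write other delimited formats; for example, a tab-separated file (a sketch; the path is illustrative):
>>> frame.export_to_csv("sandbox/export_example_tabs.csv", separator='\t')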
def export_to_csv(self, file_name, separator=','):
"""
Write current frame to disk as a CSV file
Parameters
----------
:param file_name: (str) file destination
:param separator: (str) string to be used for delimiting the fields
Example
-------
>>> frame = tc.frame.create([[1, 2, 3], [4, 5, 6]])
>>> frame.export_to_csv("sandbox/export_example.csv")
>>> frame2 = tc.frame.import_csv("sandbox/export_example.csv")
>>> frame2.inspect()
[#] C0 C1 C2
===============
[0] 1 2 3
[1] 4 5 6
"""
self._scala.exportToCsv(file_name, separator)
def export_to_hbase(
self, table_name, key_column_name=None, family_name='familyColumn')
Write current frame to HBase table.
Table must exist in HBase.
table_name | (str): | The name of the HBase table that will contain the exported frame |
key_column_name | (Optional[str]): | The name of the column to be used as row key in hbase table |
family_name | (Optional[str]): | The family name of the HBase table that will contain the exported frame |
>>> data = [[1, 0.2, -2, 5], [2, 0.4, -1, 6], [3, 0.6, 0, 7], [4, 0.8, 1, 8]]
>>> schema = [('a', int), ('b', float),('c', int) ,('d', int)]
>>> my_frame = tc.frame.create(data, schema)
>>> my_frame.export_to_hbase("test_demo_hbase", family_name="test_family")
[===Job Progress===]
Verify exported frame in hbase
From bash shell
$hbase shell
hbase(main):001:0> list
You should see test_demo_hbase table.
Run hbase(main):001:0> scan 'test_demo_hbase' (to verify frame).
Output:
ROW COLUMN+CELL
0 column=test_family:a, timestamp=1464219662295, value=1
0 column=test_family:b, timestamp=1464219662295, value=0.2
0 column=test_family:c, timestamp=1464219662295, value=-2
0 column=test_family:d, timestamp=1464219662295, value=5
1 column=test_family:a, timestamp=1464219662295, value=2
1 column=test_family:b, timestamp=1464219662295, value=0.4
1 column=test_family:c, timestamp=1464219662295, value=-1
1 column=test_family:d, timestamp=1464219662295, value=6
2 column=test_family:a, timestamp=1464219662295, value=3
2 column=test_family:b, timestamp=1464219662295, value=0.6
2 column=test_family:c, timestamp=1464219662295, value=0
2 column=test_family:d, timestamp=1464219662295, value=7
3 column=test_family:a, timestamp=1464219662295, value=4
3 column=test_family:b, timestamp=1464219662295, value=0.8
3 column=test_family:c, timestamp=1464219662295, value=1
3 column=test_family:d, timestamp=1464219662295, value=8
4 row(s) in 0.1560 seconds
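To use the values of an existing column as the HBase row key instead of the row index, pass key_column_name (an illustrative sketch; the table name is hypothetical):
>>> my_frame.export_to_hbase("test_demo_hbase_keyed", key_column_name="a", family_name="test_family")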
def export_to_hbase(self, table_name, key_column_name=None, family_name="familyColumn"):
"""
Write current frame to HBase table.
Table must exist in HBase.
Parameters
----------
:param table_name: (str) The name of the HBase table that will contain the exported frame
:param key_column_name: (Optional[str]) The name of the column to be used as row key in hbase table
:param family_name: (Optional[str]) The family name of the HBase table that will contain the exported frame
Example
-------
>>> data = [[1, 0.2, -2, 5], [2, 0.4, -1, 6], [3, 0.6, 0, 7], [4, 0.8, 1, 8]]
>>> schema = [('a', int), ('b', float),('c', int) ,('d', int)]
>>> my_frame = tc.frame.create(data, schema)
>>> my_frame.export_to_hbase("test_demo_hbase", family_name="test_family")
[===Job Progress===]
Verify exported frame in hbase
From bash shell
$hbase shell
hbase(main):001:0> list
You should see test_demo_hbase table.
Run hbase(main):001:0> scan 'test_demo_hbase' (to verify frame).
Output:
ROW COLUMN+CELL
0 column=test_family:a, timestamp=1464219662295, value=1
0 column=test_family:b, timestamp=1464219662295, value=0.2
0 column=test_family:c, timestamp=1464219662295, value=-2
0 column=test_family:d, timestamp=1464219662295, value=5
1 column=test_family:a, timestamp=1464219662295, value=2
1 column=test_family:b, timestamp=1464219662295, value=0.4
1 column=test_family:c, timestamp=1464219662295, value=-1
1 column=test_family:d, timestamp=1464219662295, value=6
2 column=test_family:a, timestamp=1464219662295, value=3
2 column=test_family:b, timestamp=1464219662295, value=0.6
2 column=test_family:c, timestamp=1464219662295, value=0
2 column=test_family:d, timestamp=1464219662295, value=7
3 column=test_family:a, timestamp=1464219662295, value=4
3 column=test_family:b, timestamp=1464219662295, value=0.8
3 column=test_family:c, timestamp=1464219662295, value=1
3 column=test_family:d, timestamp=1464219662295, value=8
4 row(s) in 0.1560 seconds
"""
if not isinstance(table_name, basestring):
raise ValueError("Unsupported 'table_name' parameter type. Expected string, but found %s." % type(table_name))
if not isinstance(family_name, basestring):
raise ValueError(
"Unsupported 'family_name' parameter type. Expected string, but found %s." % type(family_name))
self._scala.exportToHbase(table_name, self._tc.jutils.convert.to_scala_option(key_column_name), family_name)
def export_to_hive(
self, hive_table_name)
Write current frame to Hive table.
Table must not exist in Hive. Hive does not support case-sensitive table names and column names. Hence column names with uppercase letters will be converted to lower case by Hive.
hive_table_name | (str): | hive table name |
>>> data = [[1, 0.2, -2, 5], [2, 0.4, -1, 6], [3, 0.6, 0, 7], [4, 0.8, 1, 8]]
>>> schema = [('a', int), ('b', float),('c', int) ,('d', int)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
hive_table_name: (string): hive table name. A new table with the given name will be created; it must not already exist.
>>> my_frame.export_to_hive("demo_test_hive")
[===Job Progress===]
Verify exported frame in hive
From bash shell
$hive
hive> show tables
You should see demo_test_hive table.
Run hive> select * from demo_test_hive; (to verify frame).
def export_to_hive(self, hive_table_name):
"""
Write current frame to Hive table.
Table must not exist in Hive. Hive does not support case-sensitive table names and column names.
Hence column names with uppercase letters will be converted to lower case by Hive.
Parameters
----------
:param hive_table_name: (str) hive table name
Example
--------
>>> data = [[1, 0.2, -2, 5], [2, 0.4, -1, 6], [3, 0.6, 0, 7], [4, 0.8, 1, 8]]
>>> schema = [('a', int), ('b', float),('c', int) ,('d', int)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
hive_table_name: (string): hive table name. A new table with the given name will be created; it must not already exist.
>>> my_frame.export_to_hive("demo_test_hive")
[===Job Progress===]
Verify exported frame in hive
From bash shell
$hive
hive> show tables
You should see demo_test_hive table.
Run hive> select * from demo_test_hive; (to verify frame).
"""
self._scala.exportToHive(hive_table_name)
def export_to_jdbc(
self, connection_url, table_name)
Write current frame to JDBC table
connection_url | (str): | JDBC connection url to database server |
table_name | (str): | JDBC table name |
>>> from sparktk import TkContext
>>> c=TkContext(sc)
>>> data = [[1, 0.2, -2, 5], [2, 0.4, -1, 6], [3, 0.6, 0, 7], [4, 0.8, 1, 8]]
>>> schema = [('a', int), ('b', float),('c', int) ,('d', int)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
connection_url : (string) : "jdbc:{database_type}://{host}/{database_name}"
Sample connection string for postgres, e.g.: jdbc:postgresql://localhost/postgres [standard connection string to connect to default 'postgres' database]
table_name: (string): table name. A new table with the given name will be created if it does not already exist.
>>> my_frame.export_to_jdbc("jdbc:postgresql://localhost/postgres", "demo_test")
[===Job Progress===]
Verify exported frame in postgres
From bash shell
$ sudo -su postgres psql
postgres=# \d
You should see demo_test table.
Run postgres=# select * from demo_test; (to verify frame).
def export_to_jdbc(self, connection_url, table_name):
"""
Write current frame to JDBC table
Parameters
----------
:param connection_url: (str) JDBC connection url to database server
:param table_name: (str) JDBC table name
Example
-------
>>> from sparktk import TkContext
>>> c=TkContext(sc)
>>> data = [[1, 0.2, -2, 5], [2, 0.4, -1, 6], [3, 0.6, 0, 7], [4, 0.8, 1, 8]]
>>> schema = [('a', int), ('b', float),('c', int) ,('d', int)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
connection_url : (string) : "jdbc:{database_type}://{host}/{database_name}"
Sample connection string for postgres,
e.g.: jdbc:postgresql://localhost/postgres [standard connection string to connect to default 'postgres' database]
table_name: (string): table name. A new table with the given name will be created if it does not already exist.
>>> my_frame.export_to_jdbc("jdbc:postgresql://localhost/postgres", "demo_test")
[===Job Progress===]
Verify exported frame in postgres
From bash shell
$ sudo -su postgres psql
postgres=# \d
You should see demo_test table.
Run postgres=# select * from demo_test; (to verify frame).
"""
self._scala.exportToJdbc(connection_url, table_name)
def export_to_json(
self, path, count=0, offset=0)
Write current frame to HDFS in Json format.
path | (str): | The HDFS folder path where the files will be created. |
count | (Optional[int]): | The number of records you want. Default (0), or a non-positive value, is the whole frame. |
offset | (Optional[int]): | The number of rows to skip before exporting to the file. Default is zero (0). |
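As a minimal usage sketch (the HDFS path is illustrative), export 100 rows after skipping the first 10:
>>> frame.export_to_json("sandbox/export_json_example", count=100, offset=10)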
def export_to_json(self, path, count=0, offset=0):
"""
Write current frame to HDFS in Json format.
Parameters
----------
:param path: (str) The HDFS folder path where the files will be created.
:param count: (Optional[int]) The number of records you want. Default (0), or a non-positive value, is the
whole frame.
:param offset: (Optional[int]) The number of rows to skip before exporting to the file. Default is zero (0).
"""
self._scala.exportToJson(path, count, offset)
def filter(
self, predicate)
Select all rows which satisfy a predicate.
Modifies the current frame to save defined rows and delete everything else.
predicate | (UDF): | Function which evaluates a row to a boolean; rows that answer False are dropped from the frame. |
>>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
... ['Susan',33,3,'555-0202'],
... ['Thurston',65,26,'555-4510'],
... ['Judy',44,14,'555-2183']],
... schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])
>>> frame.inspect()
[#] name age tenure phone
====================================
[0] Fred 39 16 555-1234
[1] Susan 33 3 555-0202
[2] Thurston 65 26 555-4510
[3] Judy 44 14 555-2183
>>> frame.filter(lambda row: row.tenure >= 15) # keep only people with 15 or more years tenure
>>> frame.inspect()
[#] name age tenure phone
====================================
[0] Fred 39 16 555-1234
[1] Thurston 65 26 555-4510
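A predicate may reference several columns at once; as an illustrative sketch (not part of the original example set):
>>> frame.filter(lambda row: row.age < 60 and row.tenure >= 15)  # would keep only Fred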
More information on a |UDF| can be found at :doc:`/ds_apir`.
def filter(self, predicate):
"""
Select all rows which satisfy a predicate.
Modifies the current frame to save defined rows and delete everything
else.
Parameters
----------
:param predicate: (UDF) Function which evaluates a row to a boolean; rows that answer False are dropped
from the frame.
Examples
--------
>>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
... ['Susan',33,3,'555-0202'],
... ['Thurston',65,26,'555-4510'],
... ['Judy',44,14,'555-2183']],
... schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])
>>> frame.inspect()
[#] name age tenure phone
====================================
[0] Fred 39 16 555-1234
[1] Susan 33 3 555-0202
[2] Thurston 65 26 555-4510
[3] Judy 44 14 555-2183
>>> frame.filter(lambda row: row.tenure >= 15) # keep only people with 15 or more years tenure
>>> frame.inspect()
[#] name age tenure phone
====================================
[0] Fred 39 16 555-1234
[1] Thurston 65 26 555-4510
More information on a |UDF| can be found at :doc:`/ds_apir`.
"""
row = Row(self.schema)
def filter_func(r):
row._set_data(r)
return predicate(row)
self._python.rdd = self._python.rdd.filter(filter_func)
def flatten_columns(
self, columns)
Spread data to multiple rows based on cell data.
columns | (str or tuple(str, str)): | The name of the column to be flattened, or a tuple with the column name and delimiter string. The default delimiter is a comma (,). |
Splits cells in the specified columns into multiple rows according to a string delimiter. New rows are a full copy of the original row, but the specified columns only contain one value. The original row is deleted.
Given a data file:
1-solo,mono,single-green,yellow,red
2-duo,double-orange,black
The commands to bring the data into a frame, where it can be worked on:
>>> frame.inspect()
[#] a b c
==========================================
[0] 1 solo,mono,single green|yellow|red
[1] 2 duo,double orange|black
Now, spread out those sub-strings in columns b and c by specifying the column names and delimiters:
>>> frame.flatten_columns([('b', ','), ('c', '|')])
[===Job Progress===]
Note that the delimiter is optional, and if no delimiter is specified, the default is a comma (,). So, in the above example, the delimiter for b could be omitted.
Check again:
>>> frame.inspect()
[#] a b c
======================
[0] 1 solo green
[1] 1 mono yellow
[2] 1 single red
[3] 2 duo orange
[4] 2 double black
Alternatively, we can flatten a single column b using the default comma delimiter:
>>> frame.flatten_columns('b')
[===Job Progress===]
Check again:
>>> frame.inspect()
[#] a b c
================================
[0] 1 solo green|yellow|red
[1] 1 mono green|yellow|red
[2] 1 single green|yellow|red
[3] 2 duo orange|black
[4] 2 double orange|black
def flatten_columns(self, columns):
"""
Spread data to multiple rows based on cell data.
Parameters
----------
:param columns: (str or tuple(str, str)) The name of the column to be flattened, or a tuple with the column name and
delimiter string. The default delimiter is a comma (,).
Splits cells in the specified columns into multiple rows according to a string delimiter.
New rows are a full copy of the original row, but the specified columns only contain one value.
The original row is deleted.
Examples
--------
Given a data file:
1-solo,mono,single-green,yellow,red
2-duo,double-orange,black
The commands to bring the data into a frame, where it can be worked on:
>>> frame.inspect()
[#] a b c
==========================================
[0] 1 solo,mono,single green|yellow|red
[1] 2 duo,double orange|black
Now, spread out those sub-strings in columns *b* and *c* by specifying the column names and delimiters:
>>> frame.flatten_columns([('b', ','), ('c', '|')])
[===Job Progress===]
Note that the delimiter is optional, and if no delimiter is specified, the default
is a comma (,). So, in the above example, the delimiter for *b* could be omitted.
Check again:
>>> frame.inspect()
[#] a b c
======================
[0] 1 solo green
[1] 1 mono yellow
[2] 1 single red
[3] 2 duo orange
[4] 2 double black
Alternatively, we can flatten a single column *b* using the default comma delimiter:
>>> frame.flatten_columns('b')
[===Job Progress===]
Check again:
>>> frame.inspect()
[#] a b c
================================
[0] 1 solo green|yellow|red
[1] 1 mono green|yellow|red
[2] 1 single green|yellow|red
[3] 2 duo orange|black
[4] 2 double orange|black
"""
if not isinstance(columns, list):
columns = [columns]
columns = [c if isinstance(c, tuple) else (c, None) for c in columns]
return self._scala.flattenColumns(self._tc.jutils.convert.to_scala_list_string_option_tuple(columns))
def group_by(
self, group_by_columns, *aggregations)
Create a summarized frame with aggregations (Avg, Count, Max, Min, Mean, Sum, Stdev, ...).
group_by_columns | (List[str]): | list of columns to group on |
aggregations | (dict): | Aggregation function based on entire row, and/or dictionaries (one or more) of { column name str : aggregation function(s) }. |
Returns | (Frame): | Summarized Frame |
Creates a new frame and returns a Frame object to access it. Takes a column or group of columns, finds the unique combination of values, and creates unique rows with these column values. The other columns are combined according to the aggregation argument(s).
Aggregation currently supports using the following functions:
* avg
* count
* count_distinct
* max
* min
* stdev
* sum
* var
* histogram()
- Column order is not guaranteed when columns are added.
- The column names created by aggregation functions in the new frame are the original column name appended with the '_' character and the aggregation function. For example, if the original field is a and the function is avg, the resultant column is named a_avg.
- An aggregation argument of count results in a column named count.
- The aggregation function agg.count is the only full row aggregation function supported at this time.
Consider this frame:
>>> frame.inspect()
[#] a b c d e f g
========================================
[0] 1 alpha 3.0 small 1 3.0 9
[1] 1 bravo 5.0 medium 1 4.0 9
[2] 1 alpha 5.0 large 1 8.0 8
[3] 2 bravo 8.0 large 1 5.0 7
[4] 2 charlie 12.0 medium 1 6.0 6
[5] 2 bravo 7.0 small 1 8.0 5
[6] 2 bravo 12.0 large 1 6.0 4
Count the groups in column 'b'
>>> b_count = frame.group_by('b', tc.agg.count)
[===Job Progress===]
>>> b_count.inspect()
[#] b count
===================
[0] alpha 2
[1] charlie 1
[2] bravo 4
Group by columns 'a' and 'b' and compute the average for column 'c'
>>> avg1 = frame.group_by(['a', 'b'], {'c' : tc.agg.avg})
>>> avg1.inspect()
[#] a b c_AVG
======================
[0] 2 charlie 12.0
[1] 2 bravo 9.0
[2] 1 bravo 5.0
[3] 1 alpha 4.0
Group by column 'a' and make a bunch of calculations for the grouped columns 'f' and 'g'
>>> mix_frame = frame.group_by('a', tc.agg.count, {'f': [tc.agg.avg, tc.agg.sum, tc.agg.min], 'g': tc.agg.max})
>>> mix_frame.inspect()
[#] a count g_MAX f_AVG f_SUM f_MIN
=========================================
[0] 2 4 7 6.25 25.0 5.0
[1] 1 3 9 5.0 15.0 3.0
Group by with histogram. The histogram aggregation argument is configured with these parameters:
cutoffs | (List[int or float or long or double]): | An array of values containing bin cutoff points. Array can be list or tuple. If an array is provided, values must be progressively increasing. All bin boundaries must be included, so, with N bins, you need N+1 values. For example, |
cutoffs=[1, 5, 8, 12] # creates three bins:
# bin0 holds values [1 inclusive - 5 exclusive]
# bin1 holds values [5 inclusive - 8 exclusive]
# bin2 holds values [8 inclusive - 12 exclusive]
include_lowest | (Optional[bool]): | Specify how the boundary conditions are handled. ``True`` indicates that the lower bound of the bin is inclusive. ``False`` indicates that the upper bound is inclusive. Default is ``True``. |
strict_binning | (Optional(bool)): | Specify how values outside of the cutoffs array should be binned. If set to ``True``, each value less than cutoffs[0] or greater than cutoffs[-1] will be assigned a bin value of -1. If set to ``False``, values less than cutoffs[0] will be included in the first bin while values greater than cutoffs[-1] will be included in the final bin. |
>>> hist = frame.group_by('a', {'g': tc.agg.histogram([1, 5, 8, 9])})
>>> hist.inspect()
[#] a g_HISTOGRAM
=========================
[0] 2 [0.25, 0.75, 0.0]
[1] 1 [0.0, 0.0, 1.0]
>>> hist = frame.group_by('a', {'g': tc.agg.histogram([1, 5, 8, 9], False)})
>>> hist.inspect()
[#] a g_HISTOGRAM
=============================================
[0] 2 [0.5, 0.5, 0.0]
[1] 1 [0.0, 0.333333333333, 0.666666666667]
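Several aggregation functions can also be requested for the same column by listing them, as in the mixed example above; a sketch using functions from the supported list:
>>> stats = frame.group_by('b', {'c': [tc.agg.min, tc.agg.max, tc.agg.stdev]})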
def group_by(self, group_by_columns, *aggregations):
"""
Create a summarized frame with aggregations (Avg, Count, Max, Min, Mean, Sum, Stdev, ...).
Parameters
----------
:param group_by_columns: (List[str]) list of columns to group on
:param aggregations: (dict) Aggregation function based on entire row, and/or dictionaries (one or more) of { column name str : aggregation function(s) }.
:return: (Frame) Summarized Frame
Creates a new frame and returns a Frame object to access it. Takes a column or group of columns, finds the unique combination of
values, and creates unique rows with these column values. The other columns are combined according to the aggregation argument(s).
Aggregation currently supports using the following functions:
* avg
* count
* count_distinct
* max
* min
* stdev
* sum
* var
* histogram()
Notes
-----
* Column order is not guaranteed when columns are added
* The column names created by aggregation functions in the new frame are the original column name appended
with the '_' character and the aggregation function. For example, if the original field is *a* and the
function is *avg*, the resultant column is named *a_avg*.
* An aggregation argument of *count* results in a column named *count*.
* The aggregation function *agg.count* is the only full row aggregation function supported at this time.
Examples
-------
Consider this frame:
>>> frame.inspect()
[#] a b c d e f g
========================================
[0] 1 alpha 3.0 small 1 3.0 9
[1] 1 bravo 5.0 medium 1 4.0 9
[2] 1 alpha 5.0 large 1 8.0 8
[3] 2 bravo 8.0 large 1 5.0 7
[4] 2 charlie 12.0 medium 1 6.0 6
[5] 2 bravo 7.0 small 1 8.0 5
[6] 2 bravo 12.0 large 1 6.0 4
Count the groups in column 'b'
>>> b_count = frame.group_by('b', tc.agg.count)
[===Job Progress===]
>>> b_count.inspect()
[#] b count
===================
[0] alpha 2
[1] charlie 1
[2] bravo 4
Group by columns 'a' and 'b' and compute the average for column 'c'
>>> avg1 = frame.group_by(['a', 'b'], {'c' : tc.agg.avg})
>>> avg1.inspect()
[#] a b c_AVG
======================
[0] 2 charlie 12.0
[1] 2 bravo 9.0
[2] 1 bravo 5.0
[3] 1 alpha 4.0
Group by column 'a' and make a bunch of calculations for the grouped columns 'f' and 'g'
>>> mix_frame = frame.group_by('a', tc.agg.count, {'f': [tc.agg.avg, tc.agg.sum, tc.agg.min], 'g': tc.agg.max})
>>> mix_frame.inspect()
[#] a count g_MAX f_AVG f_SUM f_MIN
=========================================
[0] 2 4 7 6.25 25.0 5.0
[1] 1 3 9 5.0 15.0 3.0
**Group by with histogram**. The histogram aggregation argument is configured with these parameters:
:param cutoffs: (List[int or float or long or double]) An array of values containing bin cutoff points.
Array can be list or tuple. If an array is provided, values must be progressively increasing. All bin
boundaries must be included, so, with N bins, you need N+1 values. For example,
cutoffs=[1, 5, 8, 12] # creates three bins:
# bin0 holds values [1 inclusive - 5 exclusive]
# bin1 holds values [5 inclusive - 8 exclusive]
# bin2 holds values [8 inclusive - 12 exclusive]
:param include_lowest: (Optional[bool]) Specify how the boundary conditions are handled. ``True``
indicates that the lower bound of the bin is inclusive. ``False`` indicates that the upper bound is
inclusive. Default is ``True``.
:param strict_binning: (Optional(bool)) Specify how values outside of the cutoffs array should be
binned. If set to ``True``, each value less than cutoffs[0] or greater than cutoffs[-1] will be
assigned a bin value of -1. If set to ``False``, values less than cutoffs[0] will be included in
the first bin while values greater than cutoffs[-1] will be included in the final bin.
Example
-------
>>> hist = frame.group_by('a', {'g': tc.agg.histogram([1, 5, 8, 9])})
>>> hist.inspect()
[#] a g_HISTOGRAM
=========================
[0] 2 [0.25, 0.75, 0.0]
[1] 1 [0.0, 0.0, 1.0]
>>> hist = frame.group_by('a', {'g': tc.agg.histogram([1, 5, 8, 9], False)})
>>> hist.inspect()
[#] a g_HISTOGRAM
=============================================
[0] 2 [0.5, 0.5, 0.0]
[1] 1 [0.0, 0.333333333333, 0.666666666667]
"""
if group_by_columns is None:
group_by_columns = []
elif isinstance(group_by_columns, basestring):
group_by_columns = [group_by_columns]
first_column_name = None
aggregation_list = [] # aggregationFunction : String, columnName : String, newColumnName
for arg in aggregations:
if arg == agg.count:
if not first_column_name:
# only make this call once, since it goes to http - TODO, ultimately should be handled server-side
first_column_name = self.column_names[0]
aggregation_list.append(
{'function': agg.count, 'column_name': first_column_name, 'new_column_name': "count"})
elif isinstance(arg, dict):
for key, value in arg.iteritems():
# leave the valid column check to the server
if isinstance(value, list) or isinstance(value, tuple):
for item in value:
if item not in agg:
raise ValueError(
"%s is not a valid aggregation function, like agg.max. Supported agg methods: %s" % (
item, agg))
aggregation_list.append(
{'function': item, 'column_name': key, 'new_column_name': "%s_%s" % (key, item)})
else:
aggregation_list.append(
{'function': value, 'column_name': key, 'new_column_name': "%s_%s" % (key, value)})
else:
raise TypeError(
"Bad type %s provided in aggregation arguments; expecting an aggregation function or a dictionary of column_name:[func]" % type(
arg))
scala_group_by_aggregation_args = []
for item in aggregation_list:
scala_group_by_aggregation_args.append(self._tc.jutils.convert.to_scala_group_by_aggregation_args(item))
from sparktk.frame.frame import Frame
return Frame(self._tc, self._scala.groupBy(self._tc.jutils.convert.to_scala_list_string(group_by_columns),
self._tc.jutils.convert.to_scala_list(scala_group_by_aggregation_args)))
def histogram(
self, column_name, num_bins=None, weight_column_name=None, bin_type='equalwidth')
Compute the histogram for a column in a frame.
The returned value is a Histogram object containing 3 lists one each for: the cutoff points of the bins, size of each bin, and density of each bin.
column_name | (str): | Name of column to be evaluated. |
num_bins | (Optional[int]): | Number of bins in histogram. Default is the Square-root choice, i.e. math.floor(math.sqrt(frame.count())). |
weight_column_name | (Optional[str]): | Name of column containing weights. Default is all observations are weighted equally. |
bin_type | (str["equalwidth"|"equaldepth"]): | The type of binning algorithm to use: ["equalwidth"|"equaldepth"] Default is "equalwidth". |
Returns | (Histogram): | A Histogram object containing the result set. The data returned is composed of multiple components: |
cutoffs : array of float
A list containing the edges of each bin.
hist : array of float
A list containing the count of the weighted observations found in each bin.
density : array of float
A list of decimals giving the percentage of observations found in each bin relative to the total set.
The num_bins parameter is considered to be the maximum permissible number of bins because the data may dictate fewer bins. With equal depth binning, for example, if the column to be binned has 10 elements with only 2 distinct values and the num_bins parameter is greater than 2, then the actual number of bins will only be 2. This is due to a restriction that elements with an identical value must belong to the same bin.
Consider the following sample data set:
>>> frame.inspect()
[#] a b
=========
[0] a 2
[1] b 7
[2] c 3
[3] d 9
[4] e 1
A simple call for 3 equal-width bins gives:
>>> hist = frame.histogram("b", num_bins=3)
>>> hist.cutoffs
[1.0, 3.6666666666666665, 6.333333333333333, 9.0]
>>> hist.hist
[3.0, 0.0, 2.0]
>>> hist.density
[0.6, 0.0, 0.4]
Switching to equal depth gives:
>>> hist = frame.histogram("b", num_bins=3, bin_type='equaldepth')
>>> hist.cutoffs
[1.0, 2.0, 7.0, 9.0]
>>> hist.hist
[1.0, 2.0, 2.0]
>>> hist.density
[0.2, 0.4, 0.4]
Plot hist as a bar chart using matplotlib:
>>> import matplotlib.pyplot as plt
>>> plt.bar(hist.cutoffs[:-1], hist.hist, width=hist.cutoffs[1] - hist.cutoffs[0])
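As a cross-check in plain Python (not part of the sparktk API), each density value is the bin count divided by the total count of weighted observations; for the equal-depth histogram above:
>>> [h / sum(hist.hist) for h in hist.hist]
[0.2, 0.4, 0.4]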
def histogram(self, column_name, num_bins=None, weight_column_name=None, bin_type="equalwidth"):
"""
Compute the histogram for a column in a frame.
The returned value is a Histogram object containing 3 lists one each for:
the cutoff points of the bins, size of each bin, and density of each bin.
Parameters
----------
:param column_name: (str) Name of column to be evaluated.
:param num_bins: (Optional[int]) Number of bins in histogram.
Default is the Square-root choice, i.e. math.floor(math.sqrt(frame.count())).
:param weight_column_name: (Optional[str]) Name of column containing weights.
Default is all observations are weighted equally.
:param bin_type: (str["equalwidth"|"equaldepth"]) The type of binning algorithm to use:
["equalwidth"|"equaldepth"] Default is "equalwidth".
:return: (Histogram) A Histogram object containing the result set.
The data returned is composed of multiple components:
cutoffs : array of float
A list containing the edges of each bin.
hist : array of float
A list containing count of the weighted observations found in each bin.
density : array of float
A list of decimals giving the percentage of
observations found in each bin relative to the total set.
Notes
-----
The num_bins parameter is considered to be the maximum permissible number
of bins because the data may dictate fewer bins.
With equal depth binning, for example, if the column to be binned has 10
elements with only 2 distinct values and the *num_bins* parameter is
greater than 2, then the actual number of bins will only be 2.
This is due to a restriction that elements with an identical value must
belong to the same bin.
Examples
--------
Consider the following sample data set:
>>> frame.inspect()
[#] a b
=========
[0] a 2
[1] b 7
[2] c 3
[3] d 9
[4] e 1
A simple call for 3 equal-width bins gives:
>>> hist = frame.histogram("b", num_bins=3)
>>> hist.cutoffs
[1.0, 3.6666666666666665, 6.333333333333333, 9.0]
>>> hist.hist
[3.0, 0.0, 2.0]
>>> hist.density
[0.6, 0.0, 0.4]
Switching to equal depth gives:
>>> hist = frame.histogram("b", num_bins=3, bin_type='equaldepth')
>>> hist.cutoffs
[1.0, 2.0, 7.0, 9.0]
>>> hist.hist
[1.0, 2.0, 2.0]
>>> hist.density
[0.2, 0.4, 0.4]
Plot hist as a bar chart using matplotlib:
>>> import matplotlib.pyplot as plt
>>> plt.bar(hist.cutoffs[:-1], hist.hist, width=hist.cutoffs[1] - hist.cutoffs[0])
"""
results = self._tc.jutils.convert.scala_map_to_python_with_iterable_values(self._scala.histogram(column_name,
self._tc.jutils.convert.to_scala_option(num_bins),
self._tc.jutils.convert.to_scala_option(weight_column_name),
bin_type))
return Histogram(**results)
def inspect(
self, n=10, offset=0, columns=None, wrap='inspect_settings', truncate='inspect_settings', round='inspect_settings', width='inspect_settings', margin='inspect_settings', with_types='inspect_settings')
Pretty-print of the frame data
Essentially returns a string, but technically returns a RowsInspection object which renders a string.
The RowsInspection object naturally converts to a str when needed, like when printed or when displayed
by the python REPL (i.e. using the object's repr). If running in a script and you want the inspect output
to be printed, it must be explicitly printed, e.g. print frame.inspect()
n | (Optional[int]): | The number of rows to print |
offset | (Optional[int]): | The number of rows to skip before printing. |
columns | (Optional[List[str]]): | Filter columns to be included. By default, all columns are included. |
wrap | (Optional[int or 'stripes']): | If set to 'stripes' then inspect prints rows in stripes; if set to an integer N, rows will be printed in clumps of N columns, where the columns are wrapped. |
truncate | (Optional[int]): | If set to integer N, all strings will be truncated to length N, including all tagged ellipses. |
round | (Optional[int]): | If set to integer N, all floating point numbers will be rounded and truncated to N digits. |
width | (Optional[int]): | If set to integer N, the print out will try to honor a max line width of N. |
margin | (Optional[int]): | Applies to 'stripes' mode only. If set to integer N, the margin for printing names in a stripe will be limited to N characters. |
with_types | (Optional[bool]): | If set to True, header will include the data_type of each column. |
Returns | (RowsInspection): | An object which naturally converts to a pretty-print string. |
To look at the first 4 rows of data in a frame:
>>> frame.inspect(4)
[#] animal name age weight
==================================
[0] human George 8 542.5
[1] human Ursula 6 495.0
[2] ape Ape 41 400.0
[3] elephant Shep 5 8630.0
For other examples, see :ref:`example_frame.inspect`.
Note: if the frame data contains unicode characters, this method may raise a Unicode exception when running in an interactive REPL (or anything else that triggers the standard python repr()). To get around this problem, explicitly print the unicode of the returned object:
>>> print unicode(frame.inspect())
Global Settings
If not specified, the arguments that control formatting receive default values from 'sparktk.inspect_settings'. Make changes there to affect all calls to inspect.
>>> import sparktk
>>> sparktk.inspect_settings
wrap 20
truncate None
round None
width 80
margin None
with_types False
>>> sparktk.inspect_settings.width = 120 # changes inspect to use 120 width globally
>>> sparktk.inspect_settings.truncate = 16 # changes inspect to always truncate strings to 16 chars
>>> sparktk.inspect_settings
wrap 20
truncate 16
round None
width 120
margin None
with_types False
>>> sparktk.inspect_settings.width = None # return value back to default
>>> sparktk.inspect_settings
wrap 20
truncate 16
round None
width 80
margin None
with_types False
>>> sparktk.inspect_settings.reset() # set everything back to default
>>> sparktk.inspect_settings
wrap 20
truncate None
round None
width 80
margin None
with_types False
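Formatting can also be overridden per call instead of globally; for example, a sketch restricting the output to two columns and rounding floats to one digit:
>>> frame.inspect(4, columns=['animal', 'weight'], round=1)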
def inspect(self,
n=10,
offset=0,
columns=None,
wrap=inspect_settings._unspecified,
truncate=inspect_settings._unspecified,
round=inspect_settings._unspecified,
width=inspect_settings._unspecified,
margin=inspect_settings._unspecified,
with_types=inspect_settings._unspecified):
"""
Pretty-print of the frame data
Essentially returns a string, but technically returns a RowsInspection object which renders a string.
The RowsInspection object naturally converts to a str when needed, like when printed or when displayed
by the python REPL (i.e. using the object's __repr__). If running in a script and you want the inspect output
to be printed, it must be explicitly printed, e.g. `print frame.inspect()`
Parameters
----------
:param n: (Optional[int]) The number of rows to print
:param offset: (Optional[int]) The number of rows to skip before printing.
:param columns: (Optional[List[str]]) Filter columns to be included. By default, all columns are included.
:param wrap: (Optional[int or 'stripes']) If set to 'stripes' then inspect prints rows in stripes; if set to an
integer N, rows will be printed in clumps of N columns, where the columns are wrapped.
:param truncate: (Optional[int]) If set to integer N, all strings will be truncated to length N, including all
tagged ellipses.
:param round: (Optional[int]) If set to integer N, all floating point numbers will be rounded and truncated to
N digits.
:param width: (Optional[int]) If set to integer N, the print out will try to honor a max line width of N.
:param margin: (Optional[int]) Applies to 'stripes' mode only. If set to integer N, the margin for printing names
in a stripe will be limited to N characters.
:param with_types: (Optional[bool]) If set to True, header will include the data_type of each column.
:return: (RowsInspection) An object which naturally converts to a pretty-print string.
Examples
--------
To look at the first 4 rows of data in a frame:
>>> frame.inspect(4)
[#] animal name age weight
==================================
[0] human George 8 542.5
[1] human Ursula 6 495.0
[2] ape Ape 41 400.0
[3] elephant Shep 5 8630.0
For other examples, see :ref:`example_frame.inspect`.
Note: if the frame data contains unicode characters, this method may raise a Unicode exception when
running in an interactive REPL (or anything else that triggers the standard python repr()). To get around
this problem, explicitly print the unicode of the returned object:
>>> print unicode(frame.inspect())
**Global Settings**
If not specified, the arguments that control formatting receive default values from
'sparktk.inspect_settings'. Make changes there to affect all calls to inspect.
>>> import sparktk
>>> sparktk.inspect_settings
wrap 20
truncate None
round None
width 80
margin None
with_types False
>>> sparktk.inspect_settings.width = 120 # changes inspect to use 120 width globally
>>> sparktk.inspect_settings.truncate = 16 # changes inspect to always truncate strings to 16 chars
>>> sparktk.inspect_settings
wrap 20
truncate 16
round None
width 120
margin None
with_types False
>>> sparktk.inspect_settings.width = None # return value back to default
>>> sparktk.inspect_settings
wrap 20
truncate 16
round None
width 80
margin None
with_types False
>>> sparktk.inspect_settings.reset() # set everything back to default
>>> sparktk.inspect_settings
wrap 20
truncate None
round None
width 80
margin None
with_types False
"""
from sparktk.frame.ops.take import take_rich
format_settings = inspect_settings.copy(wrap, truncate, round, width, margin, with_types)
result = take_rich(self, n, offset, columns)
return RowsInspection(result.data, result.schema, offset=offset, format_settings=format_settings)
def join_inner(
self, right, left_on, right_on=None, use_broadcast=None)
join_inner performs an inner join operation on two frames (or a frame with itself), creating a new frame.
right | (Frame): | Another frame to join with |
left_on | (List[str]): | Names of the columns in the left frame used to match up the two frames. |
right_on | (Optional[List[str]]): | Names of the columns in the right frame used to match up the two frames. Default is the same as the left frame. |
use_broadcast | (Optional[str]): | If one of your tables is small enough to fit in the memory of a single machine, you can use a broadcast join. Specify that table to broadcast (left or right) to possibly improve performance. Default is None. |
Returns | (Frame): | A new frame with the results of the join |
Create a new frame from a SQL JOIN operation with another frame. The frame on the 'left' is the currently active frame. The frame on the 'right' is another frame. This method takes column(s) in the left frame and matches their values with column(s) in the right frame. An 'inner' join will only allow data in the resultant frame if both the left and right frames have the same value in the matching column(s).
When a column is named the same in both frames, it will result in two columns in the new frame. The column from the left frame (originally the current frame) will be copied and the column name will have the string "_L" added to it. The same thing will happen with the column from the right frame, except its name has the string "_R" appended. The order of columns after this method is called is not guaranteed.
It is recommended that you rename the columns to meaningful terms prior to using the join method.
Consider two frames: codes and colors
>>> codes.inspect()
[#] numbers
============
[0] 1
[1] 3
[2] 1
[3] 0
[4] 2
[5] 1
[6] 5
[7] 3
>>> colors.inspect()
[#] numbers color
====================
[0] 1 red
[1] 2 yellow
[2] 3 green
[3] 4 blue
Inner join using hash joins.
>>> j = codes.join_inner(colors, 'numbers')
[===Job Progress===]
>>> j.inspect()
[#] numbers color
====================
[0] 1 red
[1] 1 red
[2] 1 red
[3] 2 yellow
[4] 3 green
[5] 3 green
(The join adds an extra column *_R which is the join column from the right frame; it may be disregarded)
Consider two frames: country_codes_frame and country_names_frame
>>> country_codes_frame.inspect()
[#] country_code area_code test_str
======================================
[0] 1 354 a
[1] 2 91 a
[2] 2 100 b
[3] 3 47 a
[4] 4 968 c
[5] 5 50 c
>>> country_names_frame.inspect()
[#] country_code country_name test_str
=========================================
[0] 1 Iceland a
[1] 1 Ice-land a
[2] 2 India b
[3] 3 Norway a
[4] 4 Oman c
[5] 6 Germany c
Join them on the 'country_code' and 'test_str' columns ('inner' join by default)
>>> composite_join = country_codes_frame.join_inner(country_names_frame, ['country_code', 'test_str'])
[===Job Progress===]
>>> composite_join.inspect()
[#] country_code area_code test_str country_name
====================================================
[0] 1 354 a Iceland
[1] 1 354 a Ice-land
[2] 2 100 b India
[3] 3 47 a Norway
[4] 4 968 c Oman
Inner join broadcasting the left table
>>> j = codes.join_inner(colors, 'numbers', use_broadcast="left")
[===Job Progress===]
>>> j.inspect()
[#] numbers color
====================
[0] 1 red
[1] 1 red
[2] 1 red
[3] 2 yellow
[4] 3 green
[5] 3 green
>>> composite_join_left = country_codes_frame.join_inner(country_names_frame, ['country_code', 'test_str'], use_broadcast="left")
[===Job Progress===]
>>> composite_join_left.inspect()
[#] country_code area_code test_str country_name
====================================================
[0] 1 354 a Iceland
[1] 1 354 a Ice-land
[2] 2 100 b India
[3] 3 47 a Norway
[4] 4 968 c Oman
Inner join broadcasting right table
>>> j = codes.join_inner(colors, 'numbers', use_broadcast="right")
[===Job Progress===]
>>> j.inspect()
[#] numbers color
====================
[0] 1 red
[1] 3 green
[2] 1 red
[3] 2 yellow
[4] 1 red
[5] 3 green
>>> composite_join_right = country_codes_frame.join_inner(country_names_frame, ['country_code', 'test_str'], use_broadcast="right")
[===Job Progress===]
>>> composite_join_right.inspect()
[#] country_code area_code test_str country_name
====================================================
[0] 1 354 a Iceland
[1] 1 354 a Ice-land
[2] 2 100 b India
[3] 3 47 a Norway
[4] 4 968 c Oman
def join_inner(self,
right,
left_on,
right_on=None,
use_broadcast=None):
"""
join_inner performs an inner join operation on two frames (or a frame with itself), creating a new frame.
Parameters
----------
:param right: (Frame) Another frame to join with
:param left_on: (List[str]) Names of the columns in the left frame used to match up the two frames.
:param right_on: (Optional[List[str]]) Names of the columns in the right frame used to match up the two frames. Default is the same as the left frame.
:param use_broadcast: (Optional[str]) If one of your tables is small enough to fit in the memory of a single machine, you can use a broadcast join.
Specify that table to broadcast (left or right) to possibly improve performance. Default is None.
:returns: (Frame) A new frame with the results of the join
Create a new frame from a SQL JOIN operation with another frame.
The frame on the 'left' is the currently active frame.
The frame on the 'right' is another frame.
This method takes column(s) in the left frame and matches their values
with column(s) in the right frame.
'inner' join will only allow data in the resultant frame if both the left and right frames have the same value
in the matching column(s).
Notes
-----
When a column is named the same in both frames, it will result in two
columns in the new frame.
The column from the *left* frame (originally the current frame) will be
copied and the column name will have the string "_L" added to it.
The same thing will happen with the column from the *right* frame,
except its name has the string "_R" appended. The order of columns
after this method is called is not guaranteed.
It is recommended that you rename the columns to meaningful terms prior
to using the ``join`` method.
Examples
--------
Consider two frames: codes and colors
>>> codes.inspect()
[#] numbers
============
[0] 1
[1] 3
[2] 1
[3] 0
[4] 2
[5] 1
[6] 5
[7] 3
>>> colors.inspect()
[#] numbers color
====================
[0] 1 red
[1] 2 yellow
[2] 3 green
[3] 4 blue
Inner join using hash joins.
>>> j = codes.join_inner(colors, 'numbers')
[===Job Progress===]
>>> j.inspect()
[#] numbers color
====================
[0] 1 red
[1] 1 red
[2] 1 red
[3] 2 yellow
[4] 3 green
[5] 3 green
(The join adds an extra column *_R which is the join column from the right frame; it may be disregarded)
Consider two frames: country_codes_frame and country_names_frame
>>> country_codes_frame.inspect()
[#] country_code area_code test_str
======================================
[0] 1 354 a
[1] 2 91 a
[2] 2 100 b
[3] 3 47 a
[4] 4 968 c
[5] 5 50 c
>>> country_names_frame.inspect()
[#] country_code country_name test_str
=========================================
[0] 1 Iceland a
[1] 1 Ice-land a
[2] 2 India b
[3] 3 Norway a
[4] 4 Oman c
[5] 6 Germany c
Join them on the 'country_code' and 'test_str' columns ('inner' join by default)
>>> composite_join = country_codes_frame.join_inner(country_names_frame, ['country_code', 'test_str'])
[===Job Progress===]
>>> composite_join.inspect()
[#] country_code area_code test_str country_name
====================================================
[0] 1 354 a Iceland
[1] 1 354 a Ice-land
[2] 2 100 b India
[3] 3 47 a Norway
[4] 4 968 c Oman
Inner join broadcasting the left table
>>> j = codes.join_inner(colors, 'numbers',use_broadcast="left")
[===Job Progress===]
>>> j.inspect()
[#] numbers color
====================
[0] 1 red
[1] 1 red
[2] 1 red
[3] 2 yellow
[4] 3 green
[5] 3 green
>>> composite_join_left = country_codes_frame.join_inner(country_names_frame, ['country_code', 'test_str'],use_broadcast="left")
[===Job Progress===]
>>> composite_join_left.inspect()
[#] country_code area_code test_str country_name
====================================================
[0] 1 354 a Iceland
[1] 1 354 a Ice-land
[2] 2 100 b India
[3] 3 47 a Norway
[4] 4 968 c Oman
Inner join broadcasting right table
>>> j = codes.join_inner(colors, 'numbers',use_broadcast="right")
[===Job Progress===]
>>> j.inspect()
[#] numbers color
====================
[0] 1 red
[1] 3 green
[2] 1 red
[3] 2 yellow
[4] 1 red
[5] 3 green
>>> composite_join_right = country_codes_frame.join_inner(country_names_frame, ['country_code', 'test_str'],use_broadcast="right")
[===Job Progress===]
>>> composite_join_right.inspect()
[#] country_code area_code test_str country_name
====================================================
[0] 1 354 a Iceland
[1] 1 354 a Ice-land
[2] 2 100 b India
[3] 3 47 a Norway
[4] 4 968 c Oman
"""
if left_on is None:
raise ValueError("Please provide column name on which join should be performed")
elif isinstance(left_on, basestring):
left_on = [left_on]
if right_on is None:
right_on = left_on
elif isinstance(right_on, basestring):
right_on = [right_on]
if len(left_on) != len(right_on):
raise ValueError("Please provide equal number of join columns")
from sparktk.frame.frame import Frame
return Frame(self._tc, self._scala.joinInner(right._scala,
self._tc.jutils.convert.to_scala_list_string(left_on),
self._tc.jutils.convert.to_scala_option(
self._tc.jutils.convert.to_scala_list_string(right_on)),
self._tc.jutils.convert.to_scala_option(use_broadcast)))
def join_left(
self, right, left_on, right_on=None, use_broadcast_right=False)
join_left performs a left (outer) join operation on two frames (or a frame with itself), creating a new frame.
right | (Frame): | Another frame to join with |
left_on | (List[str]): | Names of the columns in the left frame used to match up the two frames. |
right_on | (Optional[List[str]]): | Names of the columns in the right frame used to match up the two frames. Default is the same as the left frame. |
use_broadcast_right | (bool): | If right table is small enough to fit in the memory of a single machine, you can set use_broadcast_right to True to possibly improve performance using broadcast join. Default is False. |
Returns | (Frame): | A new frame with the results of the join |
Create a new frame from a SQL JOIN operation with another frame. The frame on the 'left' is the currently active frame. The frame on the 'right' is another frame. This method takes column(s) in the left frame and matches their values with column(s) in the right frame. A 'left' (outer) join keeps every row from the left frame, and includes data from the right frame only where its column(s) have a value which matches the value in the left frame column(s).
When a column is named the same in both frames, it will result in two columns in the new frame. The column from the left frame (originally the current frame) will be copied and the column name will have the string "_L" added to it. The same thing will happen with the column from the right frame, except its name has the string "_R" appended. The order of columns after this method is called is not guaranteed.
It is recommended that you rename the columns to meaningful terms prior to using the join method.
Consider two frames: codes and colors
>>> codes.inspect()
[#] numbers
============
[0] 1
[1] 3
[2] 1
[3] 0
[4] 2
[5] 1
[6] 5
[7] 3
>>> colors.inspect()
[#] numbers color
====================
[0] 1 red
[1] 2 yellow
[2] 3 green
[3] 4 blue
>>> j_left = codes.join_left(colors, 'numbers')
[===Job Progress===]
>>> j_left.inspect()
[#] numbers_L color
======================
[0] 0 None
[1] 1 red
[2] 1 red
[3] 1 red
[4] 2 yellow
[5] 3 green
[6] 3 green
[7] 5 None
(The join adds an extra column *_R which is the join column from the right frame; it may be disregarded)
Consider two frames: country_codes_frame and country_names_frame
>>> country_codes_frame.inspect()
[#] country_code area_code test_str
======================================
[0] 1 354 a
[1] 2 91 a
[2] 2 100 b
[3] 3 47 a
[4] 4 968 c
[5] 5 50 c
>>> country_names_frame.inspect()
[#] country_code country_name test_str
=========================================
[0] 1 Iceland a
[1] 1 Ice-land a
[2] 2 India b
[3] 3 Norway a
[4] 4 Oman c
[5] 6 Germany c
Join them on the 'country_code' and 'test_str' columns ('left' join)
>>> composite_join_left = country_codes_frame.join_left(country_names_frame, ['country_code', 'test_str'])
[===Job Progress===]
>>> composite_join_left.inspect()
[#] country_code_L area_code test_str_L country_name
========================================================
[0] 1 354 a Iceland
[1] 1 354 a Ice-land
[2] 2 91 a None
[3] 2 100 b India
[4] 3 47 a Norway
[5] 4 968 c Oman
[6] 5 50 c None
Left join broadcasting right table
>>> j_left = codes.join_left(colors, 'numbers', use_broadcast_right=True)
[===Job Progress===]
>>> j_left.inspect()
[#] numbers_L color
======================
[0] 1 red
[1] 3 green
[2] 1 red
[3] 0 None
[4] 2 yellow
[5] 1 red
[6] 5 None
[7] 3 green
>>> composite_join_left = country_codes_frame.join_left(country_names_frame, ['country_code', 'test_str'], use_broadcast_right=True)
[===Job Progress===]
>>> composite_join_left.inspect()
[#] country_code_L area_code test_str_L country_name
========================================================
[0] 1 354 a Iceland
[1] 1 354 a Ice-land
[2] 2 91 a None
[3] 2 100 b India
[4] 3 47 a Norway
[5] 4 968 c Oman
[6] 5 50 c None
def join_left(self,
right,
left_on,
right_on=None,
use_broadcast_right=False):
"""
join_left performs a left (left outer) join operation on one or two frames, creating a new frame.
Parameters
---------
:param right: (Frame) Another frame to join with
:param left_on: (List[str]) Names of the columns in the left frame used to match up the two frames.
:param right_on: (Optional[List[str]]) Names of the columns in the right frame used to match up the two frames. Default is the same as the left frame.
:param use_broadcast_right: (bool) If the right table is small enough to fit in the memory of a single machine,
set use_broadcast_right to True to potentially improve performance by using a broadcast join. Default is False.
:returns: (Frame) A new frame with the results of the join
Create a new frame from a SQL JOIN operation with another frame.
The frame on the 'left' is the currently active frame.
The frame on the 'right' is another frame.
This method takes column(s) in the left frame and matches their values
with column(s) in the right frame.
A 'left' join keeps every row from the left frame; data from the right
frame is included only where the value in its column(s) matches the value
in the left frame's column(s).
Notes
-----
When a column is named the same in both frames, it will result in two
columns in the new frame.
The column from the *left* frame (originally the current frame) will be
copied and the column name will have the string "_L" added to it.
The same thing will happen with the column from the *right* frame,
except its name has the string "_R" appended. The order of columns
after this method is called is not guaranteed.
It is recommended that you rename the columns to meaningful terms prior
to using the ``join`` method.
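For instance, a minimal hypothetical sketch (reusing the codes and colors frames from the example below) of renaming the right frame's join column up front so the joined result keeps distinct, meaningful names:
>>> colors.rename_columns({'numbers': 'color_code'})
>>> j = codes.join_left(colors, left_on='numbers', right_on='color_code')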
Examples
--------
Consider two frames: codes and colors
>>> codes.inspect()
[#] numbers
============
[0] 1
[1] 3
[2] 1
[3] 0
[4] 2
[5] 1
[6] 5
[7] 3
>>> colors.inspect()
[#] numbers color
====================
[0] 1 red
[1] 2 yellow
[2] 3 green
[3] 4 blue
>>> j_left = codes.join_left(colors, 'numbers')
[===Job Progress===]
>>> j_left.inspect()
[#] numbers_L color
======================
[0] 0 None
[1] 1 red
[2] 1 red
[3] 1 red
[4] 2 yellow
[5] 3 green
[6] 3 green
[7] 5 None
(The join adds an extra column *_R which is the join column from the right frame; it may be disregarded)
Consider two frames: country_codes_frame and country_names_frame
>>> country_codes_frame.inspect()
[#] country_code area_code test_str
======================================
[0] 1 354 a
[1] 2 91 a
[2] 2 100 b
[3] 3 47 a
[4] 4 968 c
[5] 5 50 c
>>> country_names_frame.inspect()
[#] country_code country_name test_str
=========================================
[0] 1 Iceland a
[1] 1 Ice-land a
[2] 2 India b
[3] 3 Norway a
[4] 4 Oman c
[5] 6 Germany c
Join them on the 'country_code' and 'test_str' columns ('inner' join by default)
>>> composite_join_left = country_codes_frame.join_left(country_names_frame, ['country_code', 'test_str'])
[===Job Progress===]
>>> composite_join_left.inspect()
[#] country_code_L area_code test_str_L country_name
========================================================
[0] 1 354 a Iceland
[1] 1 354 a Ice-land
[2] 2 91 a None
[3] 2 100 b India
[4] 3 47 a Norway
[5] 4 968 c Oman
[6] 5 50 c None
Left join broadcasting right table
>>> j_left = codes.join_left(colors, 'numbers', use_broadcast_right=True)
[===Job Progress===]
>>> j_left.inspect()
[#] numbers_L color
======================
[0] 1 red
[1] 3 green
[2] 1 red
[3] 0 None
[4] 2 yellow
[5] 1 red
[6] 5 None
[7] 3 green
>>> composite_join_left = country_codes_frame.join_left(country_names_frame, ['country_code', 'test_str'], use_broadcast_right=True)
[===Job Progress===]
>>> composite_join_left.inspect()
[#] country_code_L area_code test_str_L country_name
========================================================
[0] 1 354 a Iceland
[1] 1 354 a Ice-land
[2] 2 91 a None
[3] 2 100 b India
[4] 3 47 a Norway
[5] 4 968 c Oman
[6] 5 50 c None
"""
if left_on is None:
raise ValueError("Please provide column name on which join should be performed")
elif isinstance(left_on, basestring):
left_on = [left_on]
if right_on is None:
right_on = left_on
elif isinstance(right_on, basestring):
right_on = [right_on]
if len(left_on) != len(right_on):
raise ValueError("Please provide equal number of join columns")
from sparktk.frame.frame import Frame
return Frame(self._tc, self._scala.joinLeft(right._scala,
self._tc.jutils.convert.to_scala_list_string(left_on),
self._tc.jutils.convert.to_scala_option(
self._tc.jutils.convert.to_scala_list_string(right_on)),
use_broadcast_right))
def join_outer(self,
right,
left_on,
right_on=None):
"""
join_outer performs an outer (full outer) join operation on one or two frames, creating a new frame.
Parameters
----------
:param right: (Frame) Another frame to join with
:param left_on: (List[str]) Names of the columns in the left frame used to match up the two frames.
:param right_on: (Optional[List[str]]) Names of the columns in the right frame used to match up the two frames. Default is the same as the left frame.
:returns: (Frame) A new frame with the results of the join
Create a new frame from a SQL JOIN operation with another frame.
The frame on the 'left' is the currently active frame.
The frame on the 'right' is another frame.
This method takes column(s) in the left frame and matches their values
with column(s) in the right frame.
An 'outer' join keeps the rows from both frames, including those where
the left and right frames did not have the same value in the matching
column(s); the columns contributed by the other frame are filled with None.
Notes
-----
When a column is named the same in both frames, it will result in two
columns in the new frame.
The column from the *left* frame (originally the current frame) will be
copied and the column name will have the string "_L" added to it.
The same thing will happen with the column from the *right* frame,
except its name has the string "_R" appended. The order of columns
after this method is called is not guaranteed.
It is recommended that you rename the columns to meaningful terms prior
to using the ``join`` method.
Examples
--------
Consider two frames: codes and colors
>>> codes.inspect()
[#] numbers
============
[0] 1
[1] 3
[2] 1
[3] 0
[4] 2
[5] 1
[6] 5
[7] 3
>>> colors.inspect()
[#] numbers color
====================
[0] 1 red
[1] 2 yellow
[2] 3 green
[3] 4 blue
Join them on the 'numbers' column ('inner' join by default)
>>> j_outer = codes.join_outer(colors, 'numbers')
[===Job Progress===]
>>> j_outer.inspect()
[#] numbers_L color
======================
[0] 0 None
[1] 1 red
[2] 1 red
[3] 1 red
[4] 2 yellow
[5] 3 green
[6] 3 green
[7] 4 blue
[8] 5 None
(The join adds an extra column *_R which is the join column from the right frame; it may be disregarded)
Consider two frames: country_codes_frame and country_names_frame
>>> country_codes_frame.inspect()
[#] country_code area_code test_str
======================================
[0] 1 354 a
[1] 2 91 a
[2] 2 100 b
[3] 3 47 a
[4] 4 968 c
[5] 5 50 c
>>> country_names_frame.inspect()
[#] country_code country_name test_str
=========================================
[0] 1 Iceland a
[1] 1 Ice-land a
[2] 2 India b
[3] 3 Norway a
[4] 4 Oman c
[5] 6 Germany c
Join them on the 'country_code' and 'test_str' columns ('inner' join by default)
>>> composite_join_outer = country_codes_frame.join_outer(country_names_frame, ['country_code', 'test_str'])
[===Job Progress===]
>>> composite_join_outer.inspect()
[#] country_code_L area_code test_str_L country_name
========================================================
[0] 6 None c Germany
[1] 1 354 a Iceland
[2] 1 354 a Ice-land
[3] 2 91 a None
[4] 2 100 b India
[5] 3 47 a Norway
[6] 4 968 c Oman
[7] 5 50 c None
"""
if left_on is None:
raise ValueError("Please provide column name on which join should be performed")
elif isinstance(left_on, basestring):
left_on = [left_on]
if right_on is None:
right_on = left_on
elif isinstance(right_on, basestring):
right_on = [right_on]
if len(left_on) != len(right_on):
raise ValueError("Please provide equal number of join columns")
from sparktk.frame.frame import Frame
return Frame(self._tc, self._scala.joinOuter(right._scala,
self._tc.jutils.convert.to_scala_list_string(left_on),
self._tc.jutils.convert.to_scala_option(
self._tc.jutils.convert.to_scala_list_string(right_on))))
def join_right(self,
right,
left_on,
right_on=None,
use_broadcast_left=False):
"""
join_right performs a right (right outer) join operation on one or two frames, creating a new frame.
Parameters
----------
:param right: (Frame) Another frame to join with
:param left_on: (List[str]) Names of the columns in the left frame used to match up the two frames.
:param right_on: (Optional[List[str]]) Names of the columns in the right frame used to match up the two frames. Default is the same as the left frame.
:param use_broadcast_left: (bool) If the left table is small enough to fit in the memory of a single machine,
set use_broadcast_left to True to potentially improve performance by using a broadcast join. Default is False.
:returns: (Frame) A new frame with the results of the join
Create a new frame from a SQL JOIN operation with another frame.
The frame on the 'left' is the currently active frame.
The frame on the 'right' is another frame.
This method takes column(s) in the left frame and matches their values
with column(s) in the right frame.
A 'right' join works like join_left with the roles reversed: it keeps
all rows from the right frame and includes data from the left frame only
where it matches.
Notes
-----
When a column is named the same in both frames, it will result in two
columns in the new frame.
The column from the *left* frame (originally the current frame) will be
copied and the column name will have the string "_L" added to it.
The same thing will happen with the column from the *right* frame,
except its name has the string "_R" appended. The order of columns
after this method is called is not guaranteed.
It is recommended that you rename the columns to meaningful terms prior
to using the ``join`` method.
Examples
--------
Consider two frames: codes and colors
>>> codes.inspect()
[#] numbers
============
[0] 1
[1] 3
[2] 1
[3] 0
[4] 2
[5] 1
[6] 5
[7] 3
>>> colors.inspect()
[#] numbers color
====================
[0] 1 red
[1] 2 yellow
[2] 3 green
[3] 4 blue
>>> j_right = codes.join_right(colors, 'numbers')
[===Job Progress===]
>>> j_right.inspect()
[#] numbers_R color
======================
[0] 1 red
[1] 1 red
[2] 1 red
[3] 2 yellow
[4] 3 green
[5] 3 green
[6] 4 blue
(The join adds an extra column *_R which is the join column from the right frame; it may be disregarded)
Consider two frames: country_codes_frame and country_names_frame
>>> country_codes_frame.inspect()
[#] country_code area_code test_str
======================================
[0] 1 354 a
[1] 2 91 a
[2] 2 100 b
[3] 3 47 a
[4] 4 968 c
[5] 5 50 c
>>> country_names_frame.inspect()
[#] country_code country_name test_str
=========================================
[0] 1 Iceland a
[1] 1 Ice-land a
[2] 2 India b
[3] 3 Norway a
[4] 4 Oman c
[5] 6 Germany c
Join them on the 'country_code' and 'test_str' columns ('inner' join by default)
>>> composite_join_right = country_codes_frame.join_right(country_names_frame, ['country_code', 'test_str'])
[===Job Progress===]
>>> composite_join_right.inspect()
[#] area_code country_code_R country_name test_str_R
========================================================
[0] None 6 Germany c
[1] 354 1 Iceland a
[2] 354 1 Ice-land a
[3] 100 2 India b
[4] 47 3 Norway a
[5] 968 4 Oman c
Right join broadcasting left table
>>> j_right = codes.join_right(colors, 'numbers', use_broadcast_left=True)
[===Job Progress===]
>>> j_right.inspect()
[#] numbers_R color
======================
[0] 1 red
[1] 1 red
[2] 1 red
[3] 2 yellow
[4] 3 green
[5] 3 green
[6] 4 blue
>>> composite_join_right = country_codes_frame.join_right(country_names_frame, ['country_code', 'test_str'], use_broadcast_left=True)
[===Job Progress===]
>>> composite_join_right.inspect()
[#] area_code country_code_R country_name test_str_R
========================================================
[0] 354 1 Iceland a
[1] 354 1 Ice-land a
[2] 100 2 India b
[3] 47 3 Norway a
[4] 968 4 Oman c
[5] None 6 Germany c
"""
if left_on is None:
raise ValueError("Please provide column name on which join should be performed")
elif isinstance(left_on, basestring):
left_on = [left_on]
if right_on is None:
right_on = left_on
elif isinstance(right_on, basestring):
right_on = [right_on]
if len(left_on) != len(right_on):
raise ValueError("Please provide equal number of join columns")
from sparktk.frame.frame import Frame
return Frame(self._tc, self._scala.joinRight(right._scala,
self._tc.jutils.convert.to_scala_list_string(left_on),
self._tc.jutils.convert.to_scala_option(
self._tc.jutils.convert.to_scala_list_string(right_on)),
use_broadcast_left))
def map_columns(self, func, schema):
"""
Create a new frame from the output of a UDF evaluated over each row of the current frame.
Notes
-----
1. The row |UDF| ('func') must return a value in the same format as
specified by the schema.
Parameters
----------
:param func: (UDF) Function which takes the values in the row and produces a value, or collection of values, for the new cell(s).
:param schema: (List[(str,type)]) Schema for the column(s) being added.
Examples
--------
Given our frame, let's create a new frame with the name and a column with how many years the person has been over 18
>>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
... ['Susan',33,3,'555-0202'],
... ['Thurston',65,26,'555-4510'],
... ['Judy',44,14,'555-2183']],
... schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])
>>> frame.inspect()
[#] name age tenure phone
====================================
[0] Fred 39 16 555-1234
[1] Susan 33 3 555-0202
[2] Thurston 65 26 555-4510
[3] Judy 44 14 555-2183
>>> adult = frame.map_columns(lambda row: [row.name, row.age - 18], [('name', str), ('adult_years', int)])
>>> adult.inspect()
[#] name adult_years
==========================
[0] Fred 21
[1] Susan 15
[2] Thurston 47
[3] Judy 26
Note that the function returns a list, and therefore the schema also needs to be a list.
It is not necessary to use lambda syntax; any function will do, as long as it takes a single row argument, and
the function may also call other local functions, as in the sketch below.
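For example, a minimal sketch (hypothetical helper name, reusing the frame defined above) using a named function instead of a lambda:
>>> def years_of_adulthood(row):
...     return [row.name, row.age - 18]
>>> adult2 = frame.map_columns(years_of_adulthood, [('name', str), ('adult_years', int)])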
(see also the 'add_columns' frame operation)
"""
schema_helper.validate(schema)
row = Row(self.schema)
def map_columns_func(r):
row._set_data(r)
return func(row)
if isinstance(schema, list):
rdd = self._python.rdd.map(lambda r: map_columns_func(r))
else:
rdd = self._python.rdd.map(lambda r: [map_columns_func(r)])
return self._tc.frame.create(rdd, schema)
def matrix_covariance_matrix(self, matrix_column_name):
"""
Compute the Covariance Matrix of matrices stored in a frame
Parameters
----------
:param matrix_column_name: Name of the column to compute the covariance matrix on
:return: (Frame) returns the frame with a new column storing the covariance matrix for the corresponding matrix
Calculate the covariance matrix for each matrix in column 'matrix_column_name' of a frame using the following:
Element (i, j) of the covariance matrix for a given matrix X is computed from the products (Xi - Mi)(Xj - Mj),
where Mi is the mean of Xi.
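As a worked check (my arithmetic, not library output) against the example below: taking each row of the 4x4 matrix as one variable with four observations, and dividing by n - 1 as for a sample covariance, element (0, 0) is ((1 - 2.75)^2 + (2 - 2.75)^2 + (3 - 2.75)^2 + (5 - 2.75)^2) / 3 = 8.75 / 3 ≈ 2.9167, which matches the first entry of the computed CovarianceMatrix_pixeldata.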
Examples
--------
>>> from sparktk import dtypes
>>> data = [[1, [[1,2,3,5],[2,3,5,6],[4,6,7,3],[8,9,2,4]]]]
>>> schema = [('id', int),('pixeldata', dtypes.matrix)]
>>> my_frame = tc.frame.create(data, schema)
>>> my_frame.inspect()
[#] id pixeldata
============================
[0] 1 [[ 1. 2. 3. 5.]
[ 2. 3. 5. 6.]
[ 4. 6. 7. 3.]
[ 8. 9. 2. 4.]]
Compute the covariance matrix for the matrices in 'pixeldata' column of the frame
>>> my_frame.matrix_covariance_matrix('pixeldata')
A new column gets added to the existing frame storing the covariance matrix
>>> my_frame.inspect()
[#] id pixeldata
============================
[0] 1 [[ 1. 2. 3. 5.]
[ 2. 3. 5. 6.]
[ 4. 6. 7. 3.]
[ 8. 9. 2. 4.]]
[#] CovarianceMatrix_pixeldata
============================================================
[0] [[ 2.91666667 3. -1. -3.75 ]
[ 3. 3.33333333 -0.33333333 -5. ]
[ -1. -0.33333333 3.33333333 -1. ]
[ -3.75 -5. -1. 10.91666667]]
"""
self._scala.matrixCovarianceMatrix(matrix_column_name)
def matrix_pca(self, matrix_column_name, v_matrix_column_name):
"""
Compute the Principal Component Analysis of a matrix
Parameters
----------
:param matrix_column_name: Name of the column storing the matrices whose principal components are to be computed
:param v_matrix_column_name: Name of the column storing the V matrix
:return: (Frame) returns the frame with new column storing the principal components for the corresponding matrix
Calculate the Principal Components for each matrix in column 'matrix_column_name' using the V matrix
Examples
--------
>>> from sparktk import dtypes
>>> data = [[1, [[1,2,3,5],[2,3,5,6],[4,6,7,3],[8,9,2,4]]]]
>>> schema = [('id', int),('pixeldata', dtypes.matrix)]
>>> my_frame = tc.frame.create(data, schema)
>>> my_frame.inspect()
[#] id pixeldata
============================
[0] 1 [[ 1. 2. 3. 5.]
[ 2. 3. 5. 6.]
[ 4. 6. 7. 3.]
[ 8. 9. 2. 4.]]
Compute the singular value decomposition for the matrices in 'pixeldata' column of the frame
>>> my_frame.matrix_svd('pixeldata')
Three new columns get added storing the U matrix, V matrix and Singular Vectors
>>> my_frame.inspect()
[#] id pixeldata
============================
[0] 1 [[ 1. 2. 3. 5.]
[ 2. 3. 5. 6.]
[ 4. 6. 7. 3.]
[ 8. 9. 2. 4.]]
[#] U_pixeldata
========================================================
[0] [[-0.29128979 -0.43716238 -0.44530839 0.72507913]
[-0.42474933 -0.55066945 -0.26749936 -0.66692972]
[-0.55099141 -0.16785045 0.79986267 0.16868433]
[-0.65661765 0.69099814 -0.30060644 -0.0317899 ]]
[#] V_pixeldata
========================================================
[0] [[-0.47195872 0.50289367 -0.05244699 -0.72222035]
[-0.60780067 0.40702574 0.11313693 0.67239008]
[-0.44835972 -0.58469285 0.65644993 -0.16180641]
[-0.45476024 -0.48945099 -0.74399115 0.01039344]]
[#] SingularVectors_pixeldata
============================================================
[0] [[ 18.21704938 6.59797925 3.54086993 0.26080987]]
Compute the principal components using the V matrices computed for matrices in 'pixeldata'
>>> my_frame.matrix_pca('pixeldata', 'V_pixeldata')
A new column gets added storing the Principal components matrix
>>> my_frame.inspect()
[#] id pixeldata
============================
[0] 1 [[ 1. 2. 3. 5.]
[ 2. 3. 5. 6.]
[ 4. 6. 7. 3.]
[ 8. 9. 2. 4.]]
[#] U_pixeldata
========================================================
[0] [[-0.29128979 -0.43716238 -0.44530839 0.72507913]
[-0.42474933 -0.55066945 -0.26749936 -0.66692972]
[-0.55099141 -0.16785045 0.79986267 0.16868433]
[-0.65661765 0.69099814 -0.30060644 -0.0317899 ]]
[#] V_pixeldata
========================================================
[0] [[-0.47195872 0.50289367 -0.05244699 -0.72222035]
[-0.60780067 0.40702574 0.11313693 0.67239008]
[-0.44835972 -0.58469285 0.65644993 -0.16180641]
[-0.45476024 -0.48945099 -0.74399115 0.01039344]]
[#] SingularVectors_pixeldata
============================================================
[0] [[ 18.21704938 6.59797925 3.54086993 0.26080987]]
[#] PrincipalComponents_pixeldata
========================================================
[0] [[-0.47195872 1.00578734 -0.15734098 -3.61110176]
[-1.21560134 1.22107722 0.56568466 4.0343405 ]
[-1.79343888 -3.50815713 4.59514953 -0.48541923]
[-3.63808191 -4.40505888 -1.4879823 0.04157377]]
"""
self._scala.matrixPca(matrix_column_name, v_matrix_column_name)
def matrix_svd(self, matrix_column_name):
"""
Compute the Singular Value Decomposition of a matrix
Parameters
----------
:param matrix_column_name: Name of the column to compute the svd on
:return: (Frame) returns the frame with three new columns storing the U matrix, V matrix and Singular Vectors
Calculate the Singular Value Decomposition for each matrix in column 'matrix_column_name'
Examples
--------
>>> from sparktk import dtypes
>>> data = [[1, [[1,2,3,5],[2,3,5,6],[4,6,7,3],[8,9,2,4]]]]
>>> schema = [('id', int),('pixeldata', dtypes.matrix)]
>>> my_frame = tc.frame.create(data, schema)
>>> my_frame.inspect()
[#] id pixeldata
============================
[0] 1 [[ 1. 2. 3. 5.]
[ 2. 3. 5. 6.]
[ 4. 6. 7. 3.]
[ 8. 9. 2. 4.]]
Compute the singular value decomposition for the matrices in 'pixeldata' column of the frame
>>> my_frame.matrix_svd('pixeldata')
Three new columns get added storing the U matrix, V matrix and Singular Vectors
>>> my_frame.inspect()
[#] id pixeldata
============================
[0] 1 [[ 1. 2. 3. 5.]
[ 2. 3. 5. 6.]
[ 4. 6. 7. 3.]
[ 8. 9. 2. 4.]]
[#] U_pixeldata
========================================================
[0] [[-0.29128979 -0.43716238 -0.44530839 0.72507913]
[-0.42474933 -0.55066945 -0.26749936 -0.66692972]
[-0.55099141 -0.16785045 0.79986267 0.16868433]
[-0.65661765 0.69099814 -0.30060644 -0.0317899 ]]
[#] V_pixeldata
========================================================
[0] [[-0.47195872 0.50289367 -0.05244699 -0.72222035]
[-0.60780067 0.40702574 0.11313693 0.67239008]
[-0.44835972 -0.58469285 0.65644993 -0.16180641]
[-0.45476024 -0.48945099 -0.74399115 0.01039344]]
[#] SingularVectors_pixeldata
============================================================
[0] [[ 18.21704938 6.59797925 3.54086993 0.26080987]]
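As an illustrative sanity check (plain numpy, not a sparktk call; variable names are hypothetical, and it assumes the stored V holds the right singular vectors as rows, so that pixeldata ≈ U · diag(s) · V.T):
>>> import numpy as np
>>> X = np.array([[1, 2, 3, 5], [2, 3, 5, 6], [4, 6, 7, 3], [8, 9, 2, 4]], dtype=float)
>>> U = np.array([[-0.29128979, -0.43716238, -0.44530839, 0.72507913],
...               [-0.42474933, -0.55066945, -0.26749936, -0.66692972],
...               [-0.55099141, -0.16785045, 0.79986267, 0.16868433],
...               [-0.65661765, 0.69099814, -0.30060644, -0.0317899]])
>>> V = np.array([[-0.47195872, 0.50289367, -0.05244699, -0.72222035],
...               [-0.60780067, 0.40702574, 0.11313693, 0.67239008],
...               [-0.44835972, -0.58469285, 0.65644993, -0.16180641],
...               [-0.45476024, -0.48945099, -0.74399115, 0.01039344]])
>>> s = np.array([18.21704938, 6.59797925, 3.54086993, 0.26080987])
>>> assert np.allclose(U.dot(np.diag(s)).dot(V.T), X, atol=1e-5)  # reconstructs the original matrix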
"""
self._scala.matrixSvd(matrix_column_name)
def multiclass_classification_metrics(self, label_column, pred_column, beta=1.0, frequency_column=None):
"""
Statistics of accuracy, precision, and others for a multi-class classification model.
Parameters:
:param label_column: (str) The name of the column containing the correct label for each instance.
:param pred_column: (str) The name of the column containing the predicted label for each instance.
:param beta: (Optional[int]) This is the beta value to use for :math:`F_{ \beta}` measure (default F1 measure is computed);
must be greater than zero. Default is 1.
:param frequency_column: (Optional[str]) The name of an optional column containing the frequency of observations.
:return: (ClassificationMetricsValue) The data returned is composed of multiple components:
<object>.accuracy : double
<object>.confusion_matrix : table
<object>.f_measure : double
<object>.precision : double
<object>.recall : double
Calculate the accuracy, precision, confusion_matrix, recall and :math:`F_{ \beta}` measure for a
classification model.
* The **f_measure** result is the :math:`F_{ \beta}` measure for a
classification model.
The :math:`F_{ \beta}` measure of a binary classification model is the
harmonic mean of precision and recall.
If we let:
* beta :math:`\equiv \beta`,
* :math:`T_{P}` denotes the number of true positives,
* :math:`F_{P}` denotes the number of false positives, and
* :math:`F_{N}` denotes the number of false negatives
then:
.. math::
F_{ \beta} = (1 + \beta ^ 2) * \frac{ \frac{T_{P}}{T_{P} + F_{P}} * \
\frac{T_{P}}{T_{P} + F_{N}}}{ \beta ^ 2 * \frac{T_{P}}{T_{P} + \
F_{P}} + \frac{T_{P}}{T_{P} + F_{N}}}
The :math:`F_{ \beta}` measure for a multi-class classification model is
computed as the weighted average of the :math:`F_{ \beta}` measure for
each label, where the weight is the number of instances of each label.
The determination of binary vs. multi-class is automatically inferred
from the data.
* For multi-class classification models, the **recall** measure is computed as
the weighted average of the recall for each label, where the weight is
the number of instances of each label.
The determination of binary vs. multi-class is automatically inferred
from the data.
* For multi-class classification models, the **precision** measure is computed
as the weighted average of the precision for each label, where the
weight is the number of instances of each label.
The determination of binary vs. multi-class is automatically inferred
from the data.
* The **accuracy** of a classification model is the proportion of
predictions that are correctly identified.
If we let :math:`T_{P}` denote the number of true positives,
:math:`T_{N}` denote the number of true negatives, and :math:`K` denote
the total number of classified instances, then the model accuracy is
given by: :math:`\frac{T_{P} + T_{N}}{K}`.
* The **confusion_matrix** result is a confusion matrix for a
classifier model, formatted for human readability.
Examples
--------
Consider Frame *my_frame*, which contains the data
>>> my_frame.inspect()
[#] a b labels predictions
==================================
[0] red 1 0 0
[1] blue 3 1 0
[2] green 1 0 0
[3] green 0 1 1
[4] red 0 5 4
>>> cm = my_frame.multiclass_classification_metrics('labels', 'predictions')
[===Job Progress===]
>>> cm.f_measure
0.5866666666666667
>>> cm.recall
0.6
>>> cm.accuracy
0.6
>>> cm.precision
0.6666666666666666
>>> cm.confusion_matrix
Predicted_0 Predicted_1 Predicted_4
Actual_0 2 0 0
Actual_1 1 1 0
Actual_5 0 0 1
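As a worked check of the weighted averages described above (my arithmetic, not library output): the per-label F1 scores are 0.8 for label 0 (precision 2/3, recall 1), 2/3 for label 1 (precision 1, recall 1/2) and 0 for label 5, so the weighted F1 is (2*0.8 + 2*(2/3) + 1*0) / 5 ≈ 0.5867, matching cm.f_measure above; likewise the weighted recall is (2*1 + 2*0.5 + 0) / 5 = 0.6 and the weighted precision is (2*(2/3) + 2*1 + 0) / 5 ≈ 0.6667.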
"""
return ClassificationMetricsValue(self._tc, self._scala.multiClassClassificationMetrics(label_column,
pred_column,
float(beta),
self._tc.jutils.convert.to_scala_option(frequency_column)))
def power_iteration_clustering(self, source_column, destination_column, similarity_column, k=2, max_iterations=100,
initialization_mode="random"):
"""
Power Iteration Clustering finds a low-dimensional embedding of a dataset using truncated power iteration on a
normalized pair-wise similarity matrix of the data.
Parameters
----------
:param source_column: (str) Name of the column containing the source node
:param destination_column: (str) Name of the column containing the destination node
:param similarity_column: (str) Name of the column containing the similarity
:param k: (Optional(int)) Number of clusters to cluster the graph into. Default is 2
:param max_iterations: (Optional(int)) Maximum number of iterations of the power iteration loop. Default is 100
:param initialization_mode: (Optional(str)) Initialization mode of power iteration clustering. This can be either
"random" to use a random vector as vertex properties, or "degree" to use normalized sum similarities. Default is "random".
:return: (namedtuple) Returns a namedtuple containing the results frame (node and cluster), k (number of clusters),
and cluster_sizes (a map of clusters and their respective sizes)
Example
-------
>>> frame = tc.frame.create([[1,2,1.0],
... [1,3,0.3],
... [2,3,0.3],
... [3,0,0.03],
... [0,5,0.01],
... [5,4,0.3],
... [5,6,1.0],
... [4,6,0.3]],
... [('Source', int), ('Destination', int), ('Similarity',float)])
>>> frame.inspect()
[#] Source Destination Similarity
====================================
[0] 1 2 1.0
[1] 1 3 0.3
[2] 2 3 0.3
[3] 3 0 0.03
[4] 0 5 0.01
[5] 5 4 0.3
[6] 5 6 1.0
[7] 4 6 0.3
>>> x = frame.power_iteration_clustering('Source', 'Destination', 'Similarity', k=3)
>>> x.frame.inspect()
[#] id cluster
================
[0] 4 2
[1] 0 3
[2] 6 2
[3] 2 1
[4] 1 1
[5] 3 1
[6] 5 2
>>> x.k
3
>>> x.cluster_sizes
{u'2': 3, u'3': 1, u'1': 3}
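A hypothetical variant call using 'degree' initialization on the same frame (results omitted):
>>> x2 = frame.power_iteration_clustering('Source', 'Destination', 'Similarity', k=3, initialization_mode='degree')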
"""
result = self._scala.powerIterationClustering(source_column,
destination_column,
similarity_column,
k,
max_iterations,
initialization_mode)
k_val = result.k()
cluster_sizes = self._tc.jutils.convert.scala_map_to_python(result.clusterSizes())
from sparktk.frame.frame import Frame
py_frame = Frame(self._tc, result.clusterMapFrame())
return PicResult(frame=py_frame, k=k_val, cluster_sizes=cluster_sizes)
def quantile_bin_column(self, column_name, num_bins=None, bin_column_name=None):
"""
Classify column into groups with the same frequency.
Group rows of data based on the value in a single column and add a label
to identify grouping.
Equal depth binning attempts to label rows such that each bin contains the
same number of elements.
For :math:`n` bins of a column :math:`C` of length :math:`m`, the bin
number is determined by:
.. math::
\lceil n * \frac { f(C) }{ m } \rceil
where :math:`f` is a tie-adjusted ranking function over values of
:math:`C`.
If there are multiples of the same value in :math:`C`, then their
tie-adjusted rank is the average of their ordered rank values.
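As a worked instance of this formula (my arithmetic, using the 11-row example below with n = 5 bins): the value 13 has tie-adjusted rank 7, so its bin number is ceil(5 * 7 / 11) = ceil(3.18) = 4, i.e. the fourth bin, which corresponds to the zero-based label 3 shown in the example output.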
Notes
-----
1. The num_bins parameter is considered to be the maximum permissible number
of bins because the data may dictate fewer bins.
For example, if the column to be binned has a quantity of :math:`X`
elements with only 2 distinct values and the *num_bins* parameter is
greater than 2, then the actual number of bins will only be 2.
This is due to a restriction that elements with an identical value must
belong to the same bin.
Parameters
----------
:param column_name: (str) The column whose values are to be binned.
:param num_bins: (Optional[int]) The maximum number of quantiles.
Default is the Square-root choice
:math:`\lfloor \sqrt{m} \rfloor`, where :math:`m` is the number of rows.
:param bin_column_name: (Optional[str]) The name for the new column holding the grouping labels.
Default is the original column name with '_binned' appended
:return: (List[float]) A list containing the edges of each bin
Examples
--------
Given a frame with column *a* accessed by a Frame object *my_frame*:
>>> my_frame.inspect( n=11 )
[##] a
========
[0] 1
[1] 1
[2] 2
[3] 3
[4] 5
[5] 8
[6] 13
[7] 21
[8] 34
[9] 55
[10] 89
Modify the frame, adding a column showing what bin the data is in.
The data should be grouped into a maximum of five bins.
Note that each bin will have the same quantity of members (as much as
possible):
>>> cutoffs = my_frame.quantile_bin_column('a', 5, 'aEDBinned')
[===Job Progress===]
>>> my_frame.inspect( n=11 )
[##] a aEDBinned
===================
[0] 1 0
[1] 1 0
[2] 2 1
[3] 3 1
[4] 5 2
[5] 8 2
[6] 13 3
[7] 21 3
[8] 34 4
[9] 55 4
[10] 89 4
>>> print cutoffs
[1.0, 2.0, 5.0, 13.0, 34.0, 89.0]
"""
return self._tc.jutils.convert.from_scala_seq(self._scala.quantileBinColumn(column_name,
self._tc.jutils.convert.to_scala_option(num_bins),
self._tc.jutils.convert.to_scala_option(bin_column_name)))
def quantiles(self, column_name, quantiles):
"""
Returns a new frame with Quantiles and their values.
Parameters
----------
:param column_name: (str) The column to calculate quantiles on
:param quantiles: (List[float]) The quantiles being requested
:return: (Frame) A new frame with two columns (float64): requested Quantiles and their respective values.
Calculates quantiles on the given column.
Examples
--------
Consider Frame *my_frame*, which accesses a frame that contains a single
column *final_sale_price*:
>>> my_frame.inspect()
[#] final_sale_price
=====================
[0] 100
[1] 250
[2] 95
[3] 179
[4] 315
[5] 660
[6] 540
[7] 420
[8] 250
[9] 335
To calculate 10th, 50th, and 100th quantile:
>>> quantiles_frame = my_frame.quantiles('final_sale_price', [10, 50, 100])
[===Job Progress===]
A new Frame containing the requested Quantiles and their respective values
will be returned:
>>> quantiles_frame.inspect()
[#] Quantiles final_sale_price_QuantileValue
==============================================
[0] 10.0 95.0
[1] 50.0 250.0
[2] 100.0 660.0
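Similarly, a hypothetical single-quantile request would return just the median in a one-row frame:
>>> median_frame = my_frame.quantiles('final_sale_price', [50])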
"""
from sparktk.frame.frame import Frame
return Frame(self._tc, self._scala.quantiles(column_name, self._tc.jutils.convert.to_scala_list_double(quantiles)))
def rename_columns(self, names):
"""
Rename columns
Parameters
----------
:param names: (dict) Dictionary of old names to new names.
Examples
--------
Start with a frame with columns *Black* and *White*.
>>> print my_frame.schema
[('Black', <type 'unicode'>), ('White', <type 'unicode'>)]
Rename the columns to *Mercury* and *Venus*:
>>> my_frame.rename_columns({"Black": "Mercury", "White": "Venus"})
>>> print my_frame.schema
[(u'Mercury', <type 'unicode'>), (u'Venus', <type 'unicode'>)]
"""
if not isinstance(names, dict):
raise ValueError("Unsupported 'names' parameter type. Expected dictionary, but found %s." % type(names))
if self.schema is None:
raise RuntimeError("Unable rename column(s), because the frame's schema has not been defined.")
if self._is_python:
scala_rename_map = self._tc.jutils.convert.to_scala_map(names)
scala_schema = schema_to_scala(self._tc.sc, self._python.schema)
rename_scala_schema = scala_schema.renameColumns(scala_rename_map)
self._python.schema = schema_to_python(self._tc.sc, rename_scala_schema)
else:
self._scala.renameColumns(self._tc.jutils.convert.to_scala_map(names))
def reverse_box_cox(
self, column_name, lambda_value=0.0, reverse_box_cox_column_name=None)
Calculate the reverse box-cox transformation for each row on a given column_name of the current frame
column_name: | Name of the column to perform the reverse transformation on |
lambda_value: | Lambda power parameter. Default is 0.0 |
reverse_box_cox_column_name: | Optional column name for the reverse box cox value |
Returns | (Frame): | returns a frame with a new column storing the reverse box-cox transformed value |
Calculate the reverse box-cox transformation for each row in column 'column_name' of a frame using the lambda_value.
Reverse Box-cox transformation is computed by the following formula:
reverse_box_cox = exp(boxcox); if lambda=0, reverse_box_cox = (lambda * boxcox + 1)^(1/lambda) ; else
>>> data = [[7.7132064326674596, 2.81913279907],[0.207519493594015, -1.25365381375],[6.336482349262754, 2.46673638752], [7.4880388253861181, 2.76469126003],[4.9850701230259045, 2.06401101556]]
>>> schema = [("input", float), ("input_lambda_0.3", float)]
>>> my_frame = tc.frame.create(data, schema)
>>> my_frame.inspect()
[#] input input_lambda_0.3
=====================================
[0] 7.71320643267 2.81913279907
[1] 0.207519493594 -1.25365381375
[2] 6.33648234926 2.46673638752
[3] 7.48803882539 2.76469126003
[4] 4.98507012303 2.06401101556
Compute the reverse box-cox transformation on the 'input_lambda_0.3' column which stores the box-cox transformed
value on column 'input' with lambda 0.3
>>> my_frame.reverse_box_cox('input_lambda_0.3',0.3)
A new column gets added to the frame which stores the reverse box-cox transformation for each row.
These values are equal to the original values in the 'input' column.
>>> my_frame.inspect()
[#] input input_lambda_0.3 input_lambda_0.3_reverse_lambda_0.3
==========================================================================
[0] 7.71320643267 2.81913279907 7.71320643267
[1] 0.207519493594 -1.25365381375 0.207519493594
[2] 6.33648234926 2.46673638752 6.33648234926
[3] 7.48803882539 2.76469126003 7.4880388254
[4] 4.98507012303 2.06401101556 4.98507012301
def reverse_box_cox(self, column_name, lambda_value=0.0, reverse_box_cox_column_name= None):
"""
Calculate the reverse box-cox transformation for each row on a given column_name of the current frame
Parameters
----------
:param column_name: Name of the column to perform the reverse transformation on
:param lambda_value: Lambda power parameter. Default is 0.0
:param reverse_box_cox_column_name: Optional column name for the reverse box cox value
:return: (Frame) returns a frame with a new column storing the reverse box-cox transformed value
Calculate the reverse box-cox transformation for each row in column 'column_name' of a frame using the lambda_value.
Reverse Box-cox transformation is computed by the following formula:
reverse_box_cox = exp(boxcox), if lambda = 0
reverse_box_cox = (lambda * boxcox + 1)^(1/lambda), otherwise
Examples
--------
>>> data = [[7.7132064326674596, 2.81913279907],[0.207519493594015, -1.25365381375],[6.336482349262754, 2.46673638752], [7.4880388253861181, 2.76469126003],[4.9850701230259045, 2.06401101556]]
>>> schema = [("input", float), ("input_lambda_0.3", float)]
>>> my_frame = tc.frame.create(data, schema)
>>> my_frame.inspect()
[#] input input_lambda_0.3
=====================================
[0] 7.71320643267 2.81913279907
[1] 0.207519493594 -1.25365381375
[2] 6.33648234926 2.46673638752
[3] 7.48803882539 2.76469126003
[4] 4.98507012303 2.06401101556
Compute the reverse box-cox transformation on the 'input_lambda_0.3' column which stores the box-cox transformed
value on column 'input' with lambda 0.3
>>> my_frame.reverse_box_cox('input_lambda_0.3',0.3)
A new column gets added to the frame which stores the reverse box-cox transformation for each row.
These values are equal to the original values in the 'input' column.
>>> my_frame.inspect()
[#] input input_lambda_0.3 input_lambda_0.3_reverse_lambda_0.3
==========================================================================
[0] 7.71320643267 2.81913279907 7.71320643267
[1] 0.207519493594 -1.25365381375 0.207519493594
[2] 6.33648234926 2.46673638752 6.33648234926
[3] 7.48803882539 2.76469126003 7.4880388254
[4] 4.98507012303 2.06401101556 4.98507012301
"""
self._scala.reverseBoxCox(column_name, lambda_value, self._tc.jutils.convert.to_scala_option(reverse_box_cox_column_name))
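As a sanity check on the formula above, here is a small pure-Python version of the reverse transformation (a sketch, not the Scala implementation used by the method):

import math

# Sketch of the reverse Box-Cox formula described above.
def reverse_box_cox_value(boxcox, lambda_value):
    if lambda_value == 0.0:
        return math.exp(boxcox)
    return (lambda_value * boxcox + 1) ** (1.0 / lambda_value)

reverse_box_cox_value(2.81913279907, 0.3)   # ~7.713, recovering the original 'input' value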
def save(
self, path)
Persists the frame to the given file path
def save(self, path):
"""Persists the frame to the given file path"""
self._scala.save(path)
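A minimal usage sketch, assuming a TkContext is available as tc, that the saved frame can be read back with tc.load, and that the path shown is hypothetical:

>>> frame.save("sandbox/my_saved_frame")
>>> restored = tc.load("sandbox/my_saved_frame")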
def sort(
self, columns, ascending=True)
Sort by one or more columns.
columns | (str or List[str]): | Either a column name, list of column names, or list of tuples where each tuple is a name and an ascending bool value. |
ascending | (Optional[bool]): | True for ascending (default), or False for descending. |
Sort a frame by column values either ascending or descending.
Consider the frame:
>>> frame.inspect()
[#] col1 col2
==================
[0] 3 foxtrot
[1] 1 charlie
[2] 3 bravo
[3] 2 echo
[4] 4 delta
[5] 3 alpha
Sort a single column:
>>> frame.sort('col1')
[===Job Progress===]
>>> frame.inspect()
[#] col1 col2
==================
[0] 1 charlie
[1] 2 echo
[2] 3 foxtrot
[3] 3 bravo
[4] 3 alpha
[5] 4 delta
Sort a single column descending:
>>> frame.sort('col2', False)
[===Job Progress===]
>>> frame.inspect()
[#] col1 col2
==================
[0] 3 foxtrot
[1] 2 echo
[2] 4 delta
[3] 1 charlie
[4] 3 bravo
[5] 3 alpha
Sort multiple columns:
>>> frame.sort(['col1', 'col2'])
[===Job Progress===]
>>> frame.inspect()
[#] col1 col2
==================
[0] 1 charlie
[1] 2 echo
[2] 3 alpha
[3] 3 bravo
[4] 3 foxtrot
[5] 4 delta
Sort multiple columns descending:
>>> frame.sort(['col1', 'col2'], False)
[===Job Progress===]
>>> frame.inspect()
[#] col1 col2
==================
[0] 4 delta
[1] 3 foxtrot
[2] 3 bravo
[3] 3 alpha
[4] 2 echo
[5] 1 charlie
Sort multiple columns: 'col1' descending and 'col2' ascending:
>>> frame.sort([ ('col1', False), ('col2', True) ])
[===Job Progress===]
>>> frame.inspect()
[#] col1 col2
==================
[0] 4 delta
[1] 3 alpha
[2] 3 bravo
[3] 3 foxtrot
[4] 2 echo
[5] 1 charlie
def sort(self, columns, ascending=True):
"""
Sort by one or more columns.
Parameters
----------
:param columns: (str or List[str]) Either a column name, list of column names, or list of tuples where each tuple is a name and an
ascending bool value.
:param ascending: (Optional[bool]) True for ascending (default), or False for descending.
Sort a frame by column values either ascending or descending.
Examples
--------
Consider the frame:
>>> frame.inspect()
[#] col1 col2
==================
[0] 3 foxtrot
[1] 1 charlie
[2] 3 bravo
[3] 2 echo
[4] 4 delta
[5] 3 alpha
Sort a single column:
>>> frame.sort('col1')
[===Job Progress===]
>>> frame.inspect()
[#] col1 col2
==================
[0] 1 charlie
[1] 2 echo
[2] 3 foxtrot
[3] 3 bravo
[4] 3 alpha
[5] 4 delta
Sort a single column descending:
>>> frame.sort('col2', False)
[===Job Progress===]
>>> frame.inspect()
[#] col1 col2
==================
[0] 3 foxtrot
[1] 2 echo
[2] 4 delta
[3] 1 charlie
[4] 3 bravo
[5] 3 alpha
Sort multiple columns:
>>> frame.sort(['col1', 'col2'])
[===Job Progress===]
>>> frame.inspect()
[#] col1 col2
==================
[0] 1 charlie
[1] 2 echo
[2] 3 alpha
[3] 3 bravo
[4] 3 foxtrot
[5] 4 delta
Sort multiple columns descending:
>>> frame.sort(['col1', 'col2'], False)
[===Job Progress===]
>>> frame.inspect()
[#] col1 col2
==================
[0] 4 delta
[1] 3 foxtrot
[2] 3 bravo
[3] 3 alpha
[4] 2 echo
[5] 1 charlie
Sort multiple columns: 'col1' descending and 'col2' ascending:
>>> frame.sort([ ('col1', False), ('col2', True) ])
[===Job Progress===]
>>> frame.inspect()
[#] col1 col2
==================
[0] 4 delta
[1] 3 alpha
[2] 3 bravo
[3] 3 foxtrot
[4] 2 echo
[5] 1 charlie
"""
if columns is None:
raise ValueError("The columns parameter should not be None.")
elif not isinstance(columns, list):
columns = [columns]
if not columns:
raise ValueError("The columns parameter should not be empty.")
if self._is_scala:
scala_sort(self, columns, ascending)
else:
column_names = columns # list of column names
columns_ascending = ascending # boolean summarizing if we are sorting ascending or descending
if isinstance(columns[0], tuple):
are_all_proper_tuples = all(isinstance(c, tuple) and isinstance(c[0], basestring) and isinstance(c[1], bool) for c in columns)
if not are_all_proper_tuples:
raise ValueError("If the columns paramter is a list of tuples, each tuple must have a string (column name)"
"and a bool (True for ascending).")
column_names = [c[0] for c in columns] # Grab just the column names from the list of tuples
# Check ascending booleans in the tuples to see if they're all the same
are_all_same_ascending = all(c[1] == columns[0][1] for c in columns)
if are_all_same_ascending:
columns_ascending = columns[0][1]
else:
are_all_same_ascending = True
if are_all_same_ascending:
indices = sparktk.frame.schema.get_indices_for_selected_columns(self.schema, column_names)
self._python.rdd = self.rdd.sortBy(lambda x: tuple([x[index] for index in indices]), ascending=columns_ascending)
else:
# If there are different ascending values between columns, then use scala sort
scala_sort(self, columns, ascending)
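The pure-Python path above boils down to sorting by a tuple key built from the selected column indices. A standalone sketch of that idea, using a plain list instead of an RDD (illustrative only):

# Illustrative only: sort rows locally by the values at the selected column indices.
rows = [[3, 'foxtrot'], [1, 'charlie'], [3, 'bravo'], [2, 'echo'], [4, 'delta'], [3, 'alpha']]
indices = [0, 1]   # sort by col1, then col2
sorted(rows, key=lambda row: tuple(row[i] for i in indices))
# [[1, 'charlie'], [2, 'echo'], [3, 'alpha'], [3, 'bravo'], [3, 'foxtrot'], [4, 'delta']]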
def sorted_k(
self, k, column_names_and_ascending, reduce_tree_depth=2)
Get a sorted subset of the data.
k | (int): | Number of sorted records to return. |
column_names_and_ascending | (List[tuple(str, bool)]): | Column names to sort by, and true to sort column by ascending order, or false for descending order. |
reduce_tree_depth | (int): | Advanced tuning parameter which determines the depth of the reduce-tree (uses Spark's treeReduce() for scalability.) Default is 2. |
Returns | (Frame): | A new frame with a subset of sorted rows from the original frame. |
Take a number of rows and return them sorted in either ascending or descending order.
Sorting a subset of rows is more efficient than sorting the entire frame when the number of sorted rows is much less than the total number of rows in the frame.
The number of sorted rows should be much smaller than the number of rows in the original frame.
In particular:
- The number of sorted rows returned should fit in Spark driver memory. The maximum size of serialized results that can fit in the Spark driver is set by the Spark configuration parameter spark.driver.maxResultSize.
- If you encounter a Kryo buffer overflow exception, increase the Spark configuration parameter spark.kryoserializer.buffer.max.mb.
- Use Frame.sort() instead if the number of sorted rows is very large (in other words, it cannot fit in Spark driver memory).
These examples deal with the most recently-released movies in a private collection. Consider the movie collection already stored in the frame below:
>>> my_frame.inspect()
[#] genre year title
========================================================
[0] Drama 1957 12 Angry Men
[1] Crime 1946 The Big Sleep
[2] Western 1969 Butch Cassidy and the Sundance Kid
[3] Drama 1971 A Clockwork Orange
[4] Drama 2008 The Dark Knight
[5] Animation 2013 Frozen
[6] Drama 1972 The Godfather
[7] Animation 1994 The Lion King
[8] Animation 2010 Tangled
[9] Fantasy 1939 The WOnderful Wizard of Oz
This example returns the top 3 rows sorted by a single column: 'year' descending:
>>> topk_frame = my_frame.sorted_k(3, [ ('year', False) ])
[===Job Progress===]
>>> topk_frame.inspect()
[#] genre year title
=====================================
[0] Animation 2013 Frozen
[1] Animation 2010 Tangled
[2] Drama 2008 The Dark Knight
This example returns the top 5 rows sorted by multiple columns: 'genre' ascending, then 'year' descending:
>>> topk_frame = my_frame.sorted_k(5, [ ('genre', True), ('year', False) ])
[===Job Progress===]
>>> topk_frame.inspect()
[#] genre year title
=====================================
[0] Animation 2013 Frozen
[1] Animation 2010 Tangled
[2] Animation 1994 The Lion King
[3] Crime 1946 The Big Sleep
[4] Drama 2008 The Dark Knight
This example returns the top 5 rows sorted by multiple columns: 'genre' ascending, then 'year' ascending. It also illustrates the optional tuning parameter for reduce-tree depth (which does not affect the final result).
>>> topk_frame = my_frame.sorted_k(5, [ ('genre', True), ('year', True) ], reduce_tree_depth=1)
[===Job Progress===]
>>> topk_frame.inspect()
[#] genre year title
===================================
[0] Animation 1994 The Lion King
[1] Animation 2010 Tangled
[2] Animation 2013 Frozen
[3] Crime 1946 The Big Sleep
[4] Drama 1957 12 Angry Men
def sorted_k(self, k, column_names_and_ascending, reduce_tree_depth = 2):
"""
Get a sorted subset of the data.
Parameters
----------
:param k: (int) Number of sorted records to return.
:param column_names_and_ascending: (List[tuple(str, bool)]) Column names to sort by, and true to sort column
by ascending order, or false for descending order.
:param reduce_tree_depth: (int) Advanced tuning parameter which determines the depth of the
reduce-tree (uses Spark's treeReduce() for scalability.)
Default is 2.
:return: (Frame) A new frame with a subset of sorted rows from the original frame.
Take a number of rows and return them sorted in either ascending or descending order.
Sorting a subset of rows is more efficient than sorting the entire frame when
the number of sorted rows is much less than the total number of rows in the frame.
Notes
-----
The number of sorted rows should be much smaller than the number of rows
in the original frame.
In particular:
1. The number of sorted rows returned should fit in Spark driver memory.
The maximum size of serialized results that can fit in the Spark driver is
set by the Spark configuration parameter *spark.driver.maxResultSize*.
+ If you encounter a Kryo buffer overflow exception, increase the Spark
configuration parameter *spark.kryoserializer.buffer.max.mb*.
+ Use Frame.sort() instead if the number of sorted rows is very large (in
other words, it cannot fit in Spark driver memory).
Examples
--------
These examples deal with the most recently-released movies in a private collection.
Consider the movie collection already stored in the frame below:
>>> my_frame.inspect()
[#] genre year title
========================================================
[0] Drama 1957 12 Angry Men
[1] Crime 1946 The Big Sleep
[2] Western 1969 Butch Cassidy and the Sundance Kid
[3] Drama 1971 A Clockwork Orange
[4] Drama 2008 The Dark Knight
[5] Animation 2013 Frozen
[6] Drama 1972 The Godfather
[7] Animation 1994 The Lion King
[8] Animation 2010 Tangled
[9] Fantasy 1939 The WOnderful Wizard of Oz
This example returns the top 3 rows sorted by a single column: 'year' descending:
>>> topk_frame = my_frame.sorted_k(3, [ ('year', False) ])
[===Job Progress===]
>>> topk_frame.inspect()
[#] genre year title
=====================================
[0] Animation 2013 Frozen
[1] Animation 2010 Tangled
[2] Drama 2008 The Dark Knight
This example returns the top 5 rows sorted by multiple columns: 'genre' ascending, then 'year' descending:
>>> topk_frame = my_frame.sorted_k(5, [ ('genre', True), ('year', False) ])
[===Job Progress===]
>>> topk_frame.inspect()
[#] genre year title
=====================================
[0] Animation 2013 Frozen
[1] Animation 2010 Tangled
[2] Animation 1994 The Lion King
[3] Crime 1946 The Big Sleep
[4] Drama 2008 The Dark Knight
This example returns the top 5 rows sorted by multiple columns: 'genre'
ascending, then 'year' ascending.
It also illustrates the optional tuning parameter for reduce-tree depth
(which does not affect the final result).
>>> topk_frame = my_frame.sorted_k(5, [ ('genre', True), ('year', True) ], reduce_tree_depth=1)
[===Job Progress===]
>>> topk_frame.inspect()
[#] genre year title
===================================
[0] Animation 1994 The Lion King
[1] Animation 2010 Tangled
[2] Animation 2013 Frozen
[3] Crime 1946 The Big Sleep
[4] Drama 1957 12 Angry Men
"""
from sparktk.frame.frame import Frame
return Frame(self._tc,
self._scala.sortedK(k,
self._tc.jutils.convert.to_scala_list_string_bool_tuple(column_names_and_ascending),
reduce_tree_depth))
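Locally, the effect of sorted_k is comparable to keeping the k best rows under the requested ordering. A rough single-column analogue with heapq (not the distributed treeReduce implementation):

import heapq

# Illustrative only: top 3 rows by 'year' (tuple index 1) descending.
movies = [("Drama", 1957, "12 Angry Men"), ("Animation", 2013, "Frozen"),
          ("Animation", 2010, "Tangled"), ("Drama", 2008, "The Dark Knight")]
heapq.nlargest(3, movies, key=lambda row: row[1])
# [('Animation', 2013, 'Frozen'), ('Animation', 2010, 'Tangled'), ('Drama', 2008, 'The Dark Knight')]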
def take(
self, n, offset=0, columns=None)
Get data subset.
Take a subset of the currently active Frame.
(See 'collect' operation to simply get all the data from the Frame)
n | (int): | The number of rows to get from the frame (warning: do not overwhelm the python session by taking too much) |
offset | (Optional[int]): | The number of rows to skip before starting to copy. |
columns | (Optional[str or list[str]): | If not None, only the given columns' data will be provided. By default, all columns are included. |
Returns | (list[list[data]]): | raw frame data |
Consider the following frame:
>>> frame.inspect()
[#] name age tenure phone
====================================
[0] Fred 39 16 555-1234
[1] Susan 33 3 555-0202
[2] Thurston 65 26 555-4510
[3] Judy 44 14 555-2183
Use take to get the first two rows and look at the schema and data in the result:
>>> frame.take(2)
[['Fred', 39, 16, '555-1234'], ['Susan', 33, 3, '555-0202']]
Limit the columns in our result to just the name and age column:
>>> frame.take(2, columns=['name', 'age'])
[['Fred', 39], ['Susan', 33]]
def take(self, n, offset=0, columns=None):
"""
Get data subset.
Take a subset of the currently active Frame.
(See 'collect' operation to simply get all the data from the Frame)
Parameters
----------
:param n: (int) The number of rows to get from the frame (warning: do not overwhelm the python session
by taking too much)
:param offset: (Optional[int]) The number of rows to skip before starting to copy.
:param columns: (Optional[str or list[str]) If not None, only the given columns' data will be provided.
By default, all columns are included.
:return: (list[list[data]]) raw frame data
Examples
--------
Consider the following frame:
>>> frame.inspect()
[#] name age tenure phone
====================================
[0] Fred 39 16 555-1234
[1] Susan 33 3 555-0202
[2] Thurston 65 26 555-4510
[3] Judy 44 14 555-2183
Use take to get the first two rows and look at the schema and data in the result:
>>> frame.take(2)
[['Fred', 39, 16, '555-1234'], ['Susan', 33, 3, '555-0202']]
Limit the columns in our result to just the name and age column:
>>> frame.take(2, columns=['name', 'age'])
[['Fred', 39], ['Susan', 33]]
"""
require_type.non_negative_int(n, "n")
require_type.non_negative_int(offset, "offset")
if columns is not None:
columns = affirm_type.list_of_str(columns, "columns")
if not columns:
return []
if self._is_scala:
scala_data = self._scala.take(n, offset, self._tc.jutils.convert.to_scala_option_list_string(columns))
schema = get_schema_for_columns(self.schema, columns) if columns else self.schema
data = TakeCollectHelper.scala_rows_to_python(self._tc, scala_data, schema)
else:
require_type.non_negative_int(n, "n")
if offset:
data = _take_offset(self, n, offset, columns)
elif columns:
select_columns = TakeCollectHelper.get_select_columns_function(self.schema, columns)
data = self._python.rdd.map(select_columns).take(n)
else:
data = self._python.rdd.take(n)
return data
def tally(
self, sample_col, count_val)
Count number of times a value is seen.
sample_col | (str): | The name of the column from which to compute the cumulative count. |
count_val | (str): | The column value to be used for the counts. |
A cumulative count is computed by sequentially stepping through the rows, observing the column values and keeping track of the number of times the specified count_value has been seen.
Consider Frame my_frame, which accesses a frame that contains a single column named obs:
>>> my_frame.inspect()
[#] obs
========
[0] 0
[1] 1
[2] 2
[3] 0
[4] 1
[5] 2
The cumulative count for column obs is obtained by:
>>> my_frame.tally("obs", "1")
[===Job Progress===]
The Frame my_frame accesses the original frame that now contains two columns, obs that contains the original column values, and obs_tally that contains the cumulative count:
>>> my_frame.inspect()
[#] obs obs_tally
===================
[0] 0 0.0
[1] 1 1.0
[2] 2 1.0
[3] 0 1.0
[4] 1 2.0
[5] 2 2.0
def tally(self, sample_col, count_val):
"""
Count number of times a value is seen.
Parameters
----------
:param sample_col: (str) The name of the column from which to compute the cumulative count.
:param count_val: (str) The column value to be used for the counts.
A cumulative count is computed by sequentially stepping through the rows,
observing the column values and keeping track of the number of times the specified
*count_value* has been seen.
Examples
--------
Consider Frame *my_frame*, which accesses a frame that contains a single
column named *obs*:
>>> my_frame.inspect()
[#] obs
========
[0] 0
[1] 1
[2] 2
[3] 0
[4] 1
[5] 2
The cumulative count for column *obs* is obtained by:
>>> my_frame.tally("obs", "1")
[===Job Progress===]
The Frame *my_frame* accesses the original frame that now contains two
columns, *obs* that contains the original column values, and
*obs_tally* that contains the cumulative count:
>>> my_frame.inspect()
[#] obs obs_tally
===================
[0] 0 0.0
[1] 1 1.0
[2] 2 1.0
[3] 0 1.0
[4] 1 2.0
[5] 2 2.0
"""
self._scala.tally(sample_col, count_val)
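The semantics of the new column can be reproduced locally: step through the rows and keep a running count of how many times count_val has appeared so far. A minimal sketch (illustrative only, not the library's implementation):

# Illustrative only: running count of how many times count_val has been seen so far.
def running_tally(values, count_val):
    seen = 0
    result = []
    for value in values:
        if value == count_val:
            seen += 1
        result.append(float(seen))
    return result

running_tally(["0", "1", "2", "0", "1", "2"], "1")   # [0.0, 1.0, 1.0, 1.0, 2.0, 2.0]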
def tally_percent(
self, sample_col, count_val)
Compute a cumulative percent count.
sample_col | (str): | The name of the column from which to compute the cumulative sum. |
count_val | (str): | The column value to be used for the counts. |
A cumulative percent count is computed by sequentially stepping through the rows, observing the column values and keeping track of the percentage of the total number of times the specified count_value has been seen up to the current value.
Consider Frame my_frame, which accesses a frame that contains a single column named obs:
>>> my_frame.inspect()
[#] obs
========
[0] 0
[1] 1
[2] 2
[3] 0
[4] 1
[5] 2
The cumulative percent count for column obs is obtained by:
>>> my_frame.tally_percent("obs", "1")
[===Job Progress===]
The Frame my_frame accesses the original frame that now contains two columns, obs that contains the original column values, and obs_tally_percent that contains the cumulative percent count:
>>> my_frame.inspect()
[#] obs obs_tally_percent
===========================
[0] 0 0.0
[1] 1 0.5
[2] 2 0.5
[3] 0 0.5
[4] 1 1.0
[5] 2 1.0
def tally_percent(self, sample_col, count_val):
"""
Compute a cumulative percent count.
Parameters
----------
:param sample_col: (str) The name of the column from which to compute the cumulative sum.
:param count_val: (str) The column value to be used for the counts.
A cumulative percent count is computed by sequentially stepping through
the rows, observing the column values and keeping track of the percentage of the
total number of times the specified *count_value* has been seen up to
the current value.
Examples
--------
Consider Frame *my_frame*, which accesses a frame that contains a single
column named *obs*:
>>> my_frame.inspect()
[#] obs
========
[0] 0
[1] 1
[2] 2
[3] 0
[4] 1
[5] 2
The cumulative percent count for column *obs* is obtained by:
>>> my_frame.tally_percent("obs", "1")
[===Job Progress===]
The Frame *my_frame* accesses the original frame that now contains two
columns, *obs* that contains the original column values, and
*obs_tally_percent* that contains the cumulative percent count:
>>> my_frame.inspect()
[#] obs obs_tally_percent
===========================
[0] 0 0.0
[1] 1 0.5
[2] 2 0.5
[3] 0 0.5
[4] 1 1.0
[5] 2 1.0
"""
self._scala.tallyPercent(sample_col, count_val)
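The percent variant divides the same running count by the total number of occurrences of count_val. A minimal local sketch (illustrative only):

# Illustrative only: running count of count_val divided by its total count.
def running_tally_percent(values, count_val):
    total = values.count(count_val)
    seen = 0
    result = []
    for value in values:
        if value == count_val:
            seen += 1
        result.append(float(seen) / total if total else 1.0)
    return result

running_tally_percent(["0", "1", "2", "0", "1", "2"], "1")   # [0.0, 0.5, 0.5, 0.5, 1.0, 1.0]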
def timeseries_augmented_dickey_fuller_test(
self, ts_column, max_lag, regression='c')
Performs the Augmented Dickey-Fuller (ADF) Test, which tests the null hypothesis that a unit root is present in a time series sample. The test statistic that is returned is a negative number. The lower the value, the stronger the rejection of the hypothesis that there is a unit root at some level of confidence.
ts_column | (str): | Name of the column that contains the time series values to use with the ADF test. |
max_lag | (int): | The lag order to calculate the test statistic. |
regression | (Optional(str)): | The method of regression that was used. Following MacKinnon's notation, this can be "c" for constant, "nc" for no constant, "ct" for constant and trend, and "ctt" for constant, trend, and trend-squared. |
Returns | (AugmentedDickeyFullerTestResult): | Object contains the ADF test statistic and p-value. |
Consider the following frame of time series values:
>>> frame.inspect()
[#] timeseries_values
======================
[0] 3.201
[1] 3.3178
[2] 3.6279
[3] 3.5902
[4] 3.43
[5] 4.0546
[6] 3.7606
[7] 3.1231
[8] 3.2077
[9] 4.3383
Calculate the augmented Dickey-Fuller test statistic by giving it the name of the column that has the time series values and the max_lag. The function returns an object that has properties for the p-value and test statistic.
>>> frame.timeseries_augmented_dickey_fuller_test("timeseries_values", 0)
p_value = 0.0
test_stat = -9.93422373369
def timeseries_augmented_dickey_fuller_test(self, ts_column, max_lag, regression = "c"):
"""
Performs the Augmented Dickey-Fuller (ADF) Test, which tests the null hypothesis that a unit root is present
in a time series sample. The test statistic that is returned is a negative number. The lower the value, the
stronger the rejection of the hypothesis that there is a unit root at some level of confidence.
Parameters
----------
:param ts_column: (str) Name of the column that contains the time series values to use with the ADF test.
:param max_lag: (int) The lag order to calculate the test statistic.
:param regression: (Optional(str)) The method of regression that was used. Following MacKinnon's notation, this
can be "c" for constant, "nc" for no constant, "ct" for constant and trend, and "ctt" for
constant, trend, and trend-squared.
:return: (AugmentedDickeyFullerTestResult) Object contains the ADF test statistic and p-value.
Example
-------
Consider the following frame of time series values:
>>> frame.inspect()
[#] timeseries_values
======================
[0] 3.201
[1] 3.3178
[2] 3.6279
[3] 3.5902
[4] 3.43
[5] 4.0546
[6] 3.7606
[7] 3.1231
[8] 3.2077
[9] 4.3383
Calculate the augmented Dickey-Fuller test statistic by giving it the name of the column that has the time series
values and the max_lag. The function returns an object that has properties for the p-value and test statistic.
>>> frame.timeseries_augmented_dickey_fuller_test("timeseries_values", 0)
p_value = 0.0
test_stat = -9.93422373369
"""
if not isinstance(ts_column, str):
raise TypeError("ts_column parameter should be a str")
if not isinstance(max_lag, int):
raise TypeError("max_lag parameter should be a int")
if not isinstance(regression, str):
raise TypeError("regression parameter should be a str")
scala_result = self._scala.timeSeriesAugmentedDickeyFullerTest(ts_column, max_lag, regression)
return AugmentedDickeyFullerTestResult(scala_result)
def timeseries_breusch_godfrey_test(
self, residuals, factors, max_lag)
Calculates the Breusch-Godfrey test statistic for serial correlation.
residuals | (str): | Name of the column that contains residual (y) values |
factors | (List[str]): | Name of the column(s) that contain factors (x) values |
max_lag | (int): | The lag order to calculate the test statistic. |
Returns | (BreuschGodfreyTestResult): | Object contains the Breusch-Godfrey test statistic and p-value. |
Consider the following frame that uses a snippet of air quality and sensor data from:
https://archive.ics.uci.edu/ml/datasets/Air+Quality.
Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.
>>> frame.inspect()
[#] Date Time CO_GT PT08_S1_CO NMHC_GT C6H6_GT Temp
====================================================================
[0] 10/03/2004 18.00.00 2.6 1360 150 11.9 13.6
[1] 10/03/2004 19.00.00 2.0 1292 112 9.4 13.3
[2] 10/03/2004 20.00.00 2.2 1402 88 9.0 11.9
[3] 10/03/2004 21.00.00 2.2 1376 80 9.2 11.0
[4] 10/03/2004 22.00.00 1.6 1272 51 6.5 11.2
[5] 10/03/2004 23.00.00 1.2 1197 38 4.7 11.2
[6] 11/03/2004 00.00.00 1.2 1185 31 3.6 11.3
[7] 11/03/2004 01.00.00 1.0 1136 31 3.3 10.7
[8] 11/03/2004 02.00.00 0.9 1094 24 2.3 10.7
[9] 11/03/2004 03.00.00 0.6 1010 19 1.7 10.3
Calculate the Breusch-Godfrey test result:
>>> y_column = "Temp"
>>> x_columns = ['CO_GT', 'PT08_S1_CO', 'NMHC_GT', 'C6H6_GT']
>>> max_lag = 1
>>> result = frame.timeseries_breusch_godfrey_test(y_column, x_columns, max_lag)
>>> result
p_value = 0.00353847462468
test_stat = 8.50666768455
def timeseries_breusch_godfrey_test(self, residuals, factors, max_lag):
"""
Calculates the Breusch-Godfrey test statistic for serial correlation.
Parameters
----------
:param residuals: (str) Name of the column that contains residual (y) values
:param factors: (List[str]) Name of the column(s) that contain factors (x) values
:param max_lag: (int) The lag order to calculate the test statistic.
:return: (BreuschGodfreyTestResult) Object contains the Breusch-Godfrey test statistic and p-value.
Example
-------
Consider the following frame that uses a snippet of air quality and sensor data from:
https://archive.ics.uci.edu/ml/datasets/Air+Quality.
Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml].
Irvine, CA: University of California, School of Information and Computer Science.
>>> frame.inspect()
[#] Date Time CO_GT PT08_S1_CO NMHC_GT C6H6_GT Temp
====================================================================
[0] 10/03/2004 18.00.00 2.6 1360 150 11.9 13.6
[1] 10/03/2004 19.00.00 2.0 1292 112 9.4 13.3
[2] 10/03/2004 20.00.00 2.2 1402 88 9.0 11.9
[3] 10/03/2004 21.00.00 2.2 1376 80 9.2 11.0
[4] 10/03/2004 22.00.00 1.6 1272 51 6.5 11.2
[5] 10/03/2004 23.00.00 1.2 1197 38 4.7 11.2
[6] 11/03/2004 00.00.00 1.2 1185 31 3.6 11.3
[7] 11/03/2004 01.00.00 1.0 1136 31 3.3 10.7
[8] 11/03/2004 02.00.00 0.9 1094 24 2.3 10.7
[9] 11/03/2004 03.00.00 0.6 1010 19 1.7 10.3
Calculate the Breusch-Godfrey test result:
>>> y_column = "Temp"
>>> x_columns = ['CO_GT', 'PT08_S1_CO', 'NMHC_GT', 'C6H6_GT']
>>> max_lag = 1
>>> result = frame.timeseries_breusch_godfrey_test(y_column, x_columns, max_lag)
>>> result
p_value = 0.00353847462468
test_stat = 8.50666768455
"""
if not isinstance(residuals, str):
raise TypeError("residuals parameter should be a str (column name).")
if isinstance(factors, str):
factors = [factors]
if not isinstance(factors, list):
raise TypeError("factors parameter should be a list of strings (column names).")
if not isinstance(max_lag, int):
raise TypeError("max_lag parameter should be an integer.")
scala_result = self._scala.timeSeriesBreuschGodfreyTest(residuals,
self._tc.jutils.convert.to_scala_list_string(factors),
max_lag)
return BreuschGodfreyTestResult(scala_result)
def timeseries_breusch_pagan_test(
self, residuals, factors)
Performs the Breusch-Pagan test for heteroskedasticity.
residuals | (str): | Name of the column that contains residual (y) values |
factors | (List[str]): | Name of the column(s) that contain factors (x) values |
Returns | (BreuschPaganTestResult): | Object contains the Breusch-Pagan test statistic and p-value. |
Consider the following frame:
>>> frame.inspect()
[#] AT V AP RH PE
=========================================
[0] 8.34 40.77 1010.84 90.01 480.48
[1] 23.64 58.49 1011.4 74.2 445.75
[2] 29.74 56.9 1007.15 41.91 438.76
[3] 19.07 49.69 1007.22 76.79 453.09
[4] 11.8 40.66 1017.13 97.2 464.43
[5] 13.97 39.16 1016.05 84.6 470.96
[6] 22.1 71.29 1008.2 75.38 442.35
[7] 14.47 41.76 1021.98 78.41 464
[8] 31.25 69.51 1010.25 36.83 428.77
[9] 6.77 38.18 1017.8 81.13 484.3
Calculate the Breusch-Pagan test statistic where the "AT" column contains residual values and the other columns are factors:
>>> result = frame.timeseries_breusch_pagan_test("AT",["V","AP","RH","PE"])
[===Job Progress===]
The result contains the test statistic and p-value:
>>> result
p_value = 0.000147089380721
test_stat = 22.6741588802
def timeseries_breusch_pagan_test(self, residuals, factors):
"""
Performs the Breusch-Pagan test for heteroskedasticity.
Parameters
----------
:param residuals: (str) Name of the column that contains residual (y) values
:param factors: (List[str]) Name of the column(s) that contain factors (x) values
:return: (BreuschPaganTestResult) Object contains the Breusch-Pagan test statistic and p-value.
Example
-------
Consider the following frame:
>>> frame.inspect()
[#] AT V AP RH PE
=========================================
[0] 8.34 40.77 1010.84 90.01 480.48
[1] 23.64 58.49 1011.4 74.2 445.75
[2] 29.74 56.9 1007.15 41.91 438.76
[3] 19.07 49.69 1007.22 76.79 453.09
[4] 11.8 40.66 1017.13 97.2 464.43
[5] 13.97 39.16 1016.05 84.6 470.96
[6] 22.1 71.29 1008.2 75.38 442.35
[7] 14.47 41.76 1021.98 78.41 464
[8] 31.25 69.51 1010.25 36.83 428.77
[9] 6.77 38.18 1017.8 81.13 484.3
Calculate the Breusch-Pagan test statistic where the "AT" column contains residual values and the other columns are
factors:
>>> result = frame.timeseries_breusch_pagan_test("AT",["V","AP","RH","PE"])
[===Job Progress===]
The result contains the test statistic and p-value:
>>> result
p_value = 0.000147089380721
test_stat = 22.6741588802
"""
if not isinstance(residuals, str):
raise TypeError("residuals parameter should be a str (column name).")
if isinstance(factors, str):
factors = [factors]
if not isinstance(factors, list):
raise TypeError("factors parameter should be a list of strings (column names).")
scala_result = self._scala.timeSeriesBreuschPaganTest(residuals,
self._tc.jutils.convert.to_scala_list_string(factors))
return BreuschPaganTestResult(scala_result)
def timeseries_durbin_watson_test(
self, residuals)
Computes the Durbin-Watson test statistic used to determine the presence of serial correlation in the residuals. Serial correlation can show a relationship between values separated from each other by a given time lag. A value close to 0.0 gives evidence for positive serial correlation, a value close to 4.0 gives evidence for negative serial correlation, and a value close to 2.0 gives evidence for no serial correlation.
residuals | (str): | Name of the column that contains residual values |
Returns: | The Durbin-Watson test statistic |
In this example, we have a frame that contains time series values. The inspect command below shows a snippet of what the data looks like:
>>> frame.inspect()
[#] timeseries_values
======================
[0] 3.201
[1] 3.3178
[2] 3.6279
[3] 3.5902
[4] 3.43
[5] 4.0546
[6] 3.7606
[7] 3.1231
[8] 3.2077
[9] 4.3383
Calculate the Durbin-Watson test statistic by giving it the name of the column that has the time series values:
>>> frame.timeseries_durbin_watson_test("timeseries_values")
0.02678674777710402
def timeseries_durbin_watson_test(self, residuals):
"""
Computes the Durbin-Watson test statistic used to determine the presence of serial correlation in the residuals.
Serial correlation can show a relationship between values separated from each other by a given time lag. A value
close to 0.0 gives evidence for positive serial correlation, a value close to 4.0 gives evidence for negative
serial correlation, and a value close to 2.0 gives evidence for no serial correlation.
:param residuals: (str) Name of the column that contains residual values
:return: The Durbin-Watson test statistic
Example
-------
In this example, we have a frame that contains time series values. The inspect command below shows a snippet of
what the data looks like:
>>> frame.inspect()
[#] timeseries_values
======================
[0] 3.201
[1] 3.3178
[2] 3.6279
[3] 3.5902
[4] 3.43
[5] 4.0546
[6] 3.7606
[7] 3.1231
[8] 3.2077
[9] 4.3383
Calculate the Durbin-Watson test statistic by giving it the name of the column that has the time series values:
>>> frame.timeseries_durbin_watson_test("timeseries_values")
0.02678674777710402
"""
if not isinstance(residuals, str):
raise TypeError("residuals should be a str (column name).")
return self._scala.timeSeriesDurbinWatsonTest(residuals)
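For reference, the Durbin-Watson statistic over residuals e_1..e_n is sum over t=2..n of (e_t - e_{t-1})^2 divided by the sum over t=1..n of e_t^2. A small pure-Python sketch of that formula (not the Scala implementation, toy residuals only):

# Illustrative only: Durbin-Watson statistic for a list of residuals.
def durbin_watson(residuals):
    numerator = sum((residuals[t] - residuals[t - 1]) ** 2 for t in range(1, len(residuals)))
    denominator = sum(e ** 2 for e in residuals)
    return numerator / denominator

durbin_watson([0.5, -0.3, 0.2, -0.1])   # ~2.51 here; values near 2.0 indicate little serial correlation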
def timeseries_from_observations(
self, date_time_index, timestamp_column, key_column, value_column)
Returns a frame that has the observations formatted as a time series.
date_time_index: | List of date/time strings. DateTimeIndex to conform all series to. |
timestamp_column: | The name of the column telling when the observation occurred. |
key_column: | The name of the column that contains which string key the observation belongs to. |
value_column: | The name of the column that contains the observed value. |
Returns: | Frame formatted as a time series (with a column for key and a column for the vector of values). |
Uses the specified timestamp, key, and value columns and the date/time index provided to format the observations as a time series. The time series frame will have columns for the key and a vector of the observed values that correspond to the date/time index.
In this example, we will use a frame of observations of resting heart rate for three individuals over three days. The data is accessed from a Frame object called my_frame:
>>> my_frame.inspect(my_frame.count())
[#] name date resting_heart_rate
======================================================
[0] Edward 2016-01-01T12:00:00Z 62
[1] Stanley 2016-01-01T12:00:00Z 57
[2] Edward 2016-01-02T12:00:00Z 63
[3] Sarah 2016-01-02T12:00:00Z 64
[4] Stanley 2016-01-02T12:00:00Z 57
[5] Edward 2016-01-03T12:00:00Z 62
[6] Sarah 2016-01-03T12:00:00Z 64
[7] Stanley 2016-01-03T12:00:00Z 56
We then need to create an array that contains the date/time index, which will be used when creating the time series. Since our data is for three days, our date/time index will just contain those three dates:
>>> datetimeindex = ["2016-01-01T12:00:00.000Z","2016-01-02T12:00:00.000Z","2016-01-03T12:00:00.000Z"]
Then we can create our time series frame by specifying our date/time index along with the name of our timestamp column (in this example, it's "date"), key column (in this example, it's "name"), and value column (in this example, it's "resting_heart_rate").
>>> ts = my_frame.timeseries_from_observations(datetimeindex, "date", "name", "resting_heart_rate")
[===Job Progress===]
Take a look at the resulting time series frame schema and contents:
>>> ts.schema
[(u'name', <type 'unicode'>), (u'resting_heart_rate', vector(3))]
>>> ts.inspect()
[#] name resting_heart_rate
================================
[0] Stanley [57.0, 57.0, 56.0]
[1] Edward [62.0, 63.0, 62.0]
[2] Sarah [None, 64.0, 64.0]
def timeseries_from_observations(self, date_time_index, timestamp_column, key_column, value_column):
"""
Returns a frame that has the observations formatted as a time series.
:param date_time_index: List of date/time strings. DateTimeIndex to conform all series to.
:param timestamp_column: The name of the column telling when the observation occurred.
:param key_column: The name of the column that contains which string key the observation belongs to.
:param value_column: The name of the column that contains the observed value.
:return: Frame formatted as a time series (with a column for key and a column for the vector of values).
Uses the specified timestamp, key, and value columns and the date/time index provided to format the observations
as a time series. The time series frame will have columns for the key and a vector of the observed values that
correspond to the date/time index.
Examples
--------
In this example, we will use a frame of observations of resting heart rate for three individuals over three days.
The data is accessed from a Frame object called *my_frame*:
>>> my_frame.inspect(my_frame.count())
[#] name date resting_heart_rate
======================================================
[0] Edward 2016-01-01T12:00:00Z 62
[1] Stanley 2016-01-01T12:00:00Z 57
[2] Edward 2016-01-02T12:00:00Z 63
[3] Sarah 2016-01-02T12:00:00Z 64
[4] Stanley 2016-01-02T12:00:00Z 57
[5] Edward 2016-01-03T12:00:00Z 62
[6] Sarah 2016-01-03T12:00:00Z 64
[7] Stanley 2016-01-03T12:00:00Z 56
We then need to create an array that contains the date/time index,
which will be used when creating the time series. Since our data
is for three days, our date/time index will just contain those
three dates:
>>> datetimeindex = ["2016-01-01T12:00:00.000Z","2016-01-02T12:00:00.000Z","2016-01-03T12:00:00.000Z"]
Then we can create our time series frame by specifying our date/time
index along with the name of our timestamp column (in this example, it's
"date"), key column (in this example, it's "name"), and value column (in
this example, it's "resting_heart_rate").
>>> ts = my_frame.timeseries_from_observations(datetimeindex, "date", "name", "resting_heart_rate")
[===Job Progress===]
Take a look at the resulting time series frame schema and contents:
>>> ts.schema
[(u'name', <type 'unicode'>), (u'resting_heart_rate', vector(3))]
>>> ts.inspect()
[#] name resting_heart_rate
================================
[0] Stanley [57.0, 57.0, 56.0]
[1] Edward [62.0, 63.0, 62.0]
[2] Sarah [None, 64.0, 64.0]
"""
if not isinstance(date_time_index, list):
raise TypeError("date_time_index should be a list of date/times")
scala_date_list = self._tc.jutils.convert.to_scala_date_time_list(date_time_index)
from sparktk.frame.frame import Frame
return Frame(self._tc,
self._scala.timeSeriesFromObseravations(scala_date_list, timestamp_column, key_column, value_column))
def timeseries_slice(
self, date_time_index, start, end)
Returns a frame split on the specified start and end date/times.
Splits a time series frame on the specified start and end date/times.
date_time_index: | List of date/time strings. DateTimeIndex to conform all series to. |
start: | The start date for the slice in the ISO 8601 format, like: yyyy-MM-dd'T'HH:mm:ss.SSSZ |
end: | The end date for the slice in the ISO 8601 format, like: yyyy-MM-dd'T'HH:mm:ss.SSSZ |
Returns: | Frame that contains a sub-slice of the current frame, based on the specified start/end date/times. |
For this example, we start with a frame that has already been formatted as a time series. This means that the frame has a string column for key and a vector column that contains a series of the observed values. We must also know the date/time index that corresponds to the time series.
The time series is in a Frame object called ts_frame.
>>> ts_frame.inspect()
[#] key series
==================================
[0] A [62, 55, 60, 61, 60, 59]
[1] B [60, 58, 61, 62, 60, 61]
[2] C [69, 68, 68, 70, 71, 69]
Next, we define the date/time index. In this example, it is one day intervals from 2016-01-01 to 2016-01-06:
>>> datetimeindex = ["2016-01-01T12:00:00.000Z","2016-01-02T12:00:00.000Z","2016-01-03T12:00:00.000Z","2016-01-04T12:00:00.000Z","2016-01-05T12:00:00.000Z","2016-01-06T12:00:00.000Z"]
Get a slice of our time series from 2016-01-02 to 2016-01-04:
>>> slice_start = "2016-01-02T12:00:00.000Z"
>>> slice_end = "2016-01-04T12:00:00.000Z"
>>> sliced_frame = ts_frame.timeseries_slice(datetimeindex, slice_start, slice_end)
[===Job Progress===]
Take a look at our sliced time series:
>>> sliced_frame.inspect()
[#] key series
============================
[0] A [55.0, 60.0, 61.0]
[1] B [58.0, 61.0, 62.0]
[2] C [68.0, 68.0, 70.0]
def timeseries_slice(self, date_time_index, start, end):
"""
Returns a frame split on the specified start and end date/times.
Splits a time series frame on the specified start and end date/times.
:param date_time_index: List of date/time strings. DateTimeIndex to conform all series to.
:param start: The start date for the slice in the ISO 8601 format, like: yyyy-MM-dd'T'HH:mm:ss.SSSZ
:param end: The end date for the slice in the ISO 8601 format, like: yyyy-MM-dd'T'HH:mm:ss.SSSZ
:return: Frame that contains a sub-slice of the current frame, based on the specified start/end date/times.
Examples
--------
For this example, we start with a frame that has already been formatted as a time series.
This means that the frame has a string column for key and a vector column that contains
a series of the observed values. We must also know the date/time index that corresponds
to the time series.
The time series is in a Frame object called *ts_frame*.
>>> ts_frame.inspect()
[#] key series
==================================
[0] A [62, 55, 60, 61, 60, 59]
[1] B [60, 58, 61, 62, 60, 61]
[2] C [69, 68, 68, 70, 71, 69]
Next, we define the date/time index. In this example, it is one day intervals from
2016-01-01 to 2016-01-06:
>>> datetimeindex = ["2016-01-01T12:00:00.000Z","2016-01-02T12:00:00.000Z","2016-01-03T12:00:00.000Z","2016-01-04T12:00:00.000Z","2016-01-05T12:00:00.000Z","2016-01-06T12:00:00.000Z"]
Get a slice of our time series from 2016-01-02 to 2016-01-04:
>>> slice_start = "2016-01-02T12:00:00.000Z"
>>> slice_end = "2016-01-04T12:00:00.000Z"
>>> sliced_frame = ts_frame.timeseries_slice(datetimeindex, slice_start, slice_end)
[===Job Progress===]
Take a look at our sliced time series:
>>> sliced_frame.inspect()
[#] key series
============================
[0] A [55.0, 60.0, 61.0]
[1] B [58.0, 61.0, 62.0]
[2] C [68.0, 68.0, 70.0]
"""
if not isinstance(date_time_index, list):
raise TypeError("date_time_index should be a list of date/times")
if not isinstance(start, basestring):
raise TypeError("start date/time should be a string in the ISO 8601 format")
if not isinstance(end, basestring):
raise TypeError("end date/time should be a string in the ISO 8601 format")
from sparktk.frame.frame import Frame
return Frame(self._tc,
self._scala.timeSeriesSlice(self._tc.jutils.convert.to_scala_date_time_list(date_time_index),
self._tc.jutils.convert.to_scala_date_time(start),
self._tc.jutils.convert.to_scala_date_time(end)))
def to_pandas(
self, n=None, offset=0, columns=None)
Brings data into a local pandas dataframe.
Similar to the 'take' function, but puts the data into a pandas dataframe.
n | (Optional(int)): | The number of rows to get from the frame (warning: do not overwhelm the python session by taking too much) |
offset | (Optional(int)): | The number of rows to skip before copying. Defaults to 0. |
columns | (Optional(List[str])): | Column filter. The list of names to be included. Default is all columns. |
Returns | (pandas.DataFrame): | A new pandas dataframe object containing the taken frame data. |
Consider the following spark-tk frame, where we have columns for name and phone number:
>>> frame.inspect()
[#] name phone
=======================
[0] Fred 555-1234
[1] Susan 555-0202
[2] Thurston 555-4510
[3] Judy 555-2183
>>> frame.schema
[('name', <type 'str'>), ('phone', <type 'str'>)]
The frame to_pandas() method is used to get a pandas DataFrame that contains the data from the spark-tk frame. Note that since no parameters are provided when to_pandas() is called, the default values are used for the number of rows, the row offset, and the columns.
>>> pandas_frame = frame.to_pandas()
>>> pandas_frame
name phone
0 Fred 555-1234
1 Susan 555-0202
2 Thurston 555-4510
3 Judy 555-2183
def to_pandas(self, n=None, offset=0, columns=None):
"""
Brings data into a local pandas dataframe.
Similar to the 'take' function, but puts the data into a pandas dataframe.
Parameters
----------
:param n: (Optional(int)) The number of rows to get from the frame (warning: do not overwhelm the python session
by taking too much)
:param offset: (Optional(int)) The number of rows to skip before copying. Defaults to 0.
:param columns: (Optional(List[str])) Column filter. The list of names to be included. Default is all columns.
:return: (pandas.DataFrame) A new pandas dataframe object containing the taken frame data.
Examples
--------
Consider the following spark-tk frame, where we have columns for name and phone number:
>>> frame.inspect()
[#] name phone
=======================
[0] Fred 555-1234
[1] Susan 555-0202
[2] Thurston 555-4510
[3] Judy 555-2183
>>> frame.schema
[('name', <type 'str'>), ('phone', <type 'str'>)]
The frame to_pandas() method is used to get a pandas DataFrame that contains the data from the spark-tk frame. Note
that since no parameters are provided when to_pandas() is called, the default values are used for the number of
rows, the row offset, and the columns.
>>> pandas_frame = frame.to_pandas()
>>> pandas_frame
name phone
0 Fred 555-1234
1 Susan 555-0202
2 Thurston 555-4510
3 Judy 555-2183
"""
try:
import pandas
except:
raise RuntimeError("pandas module not found, unable to download. Install pandas or try the take command.")
from sparktk.frame.ops.take import take_rich
result = take_rich(self, n, offset, columns)
headers, data_types = zip(*result.schema)
frame_data = result.data
from sparktk import dtypes
import datetime
date_time_columns = [i for i, x in enumerate(self.schema) if x[1] in (dtypes.datetime, datetime.datetime)]
has_date_time = len(date_time_columns) > 0
# translate our datetime long to datetime, so that it gets into the pandas df as a datetime column
def long_to_date_time(row):
for i in date_time_columns:
if isinstance(row[i], long):
row[i] = datetime.datetime.fromtimestamp(row[i]//1000).replace(microsecond=row[i]%1000*1000)
return row
if (has_date_time):
frame_data = map(long_to_date_time, frame_data)
# create pandas df
pandas_df = pandas.DataFrame(frame_data, columns=headers)
for i, dtype in enumerate(data_types):
dtype_str = _sparktk_dtype_to_pandas_str(dtype)
try:
pandas_df[[headers[i]]] = pandas_df[[headers[i]]].astype(dtype_str)
except (TypeError, ValueError):
if dtype_str.startswith("int"):
# DataFrame does not handle missing values in int columns. If we get this error, use the 'object' datatype instead.
print "WARNING - Encountered problem casting column %s to %s, possibly due to missing values (i.e. presence of None). Continued by casting column %s as 'object'" % (headers[i], dtype_str, headers[i])
pandas_df[[headers[i]]] = pandas_df[[headers[i]]].astype("object")
else:
raise
return pandas_df
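Building on the same frame, a hedged usage sketch of the optional parameters (limiting the number of rows and the columns returned; the variable name is illustrative):

>>> names_df = frame.to_pandas(n=2, columns=['name'])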
def top_k(
self, column_name, k, weight_column=None)
Most or least frequent column values.
column_name | (str): | The column whose top (or bottom) K distinct values are to be calculated. |
k | (int): | Number of entries to return (If k is negative, return bottom k). |
weight_column | (Optional[str]): | The column that provides weights (frequencies) for the topK calculation. Must contain numerical data. Default is 1 for all items. |
Calculate the top (or bottom) K distinct values by count of a column. The column can be weighted. All data elements of weight <= 0 are excluded from the calculation, as are all data elements whose weight is NaN or infinite. If there are no data elements of finite weight > 0, then topK is empty.
For this example, we calculate the top 2 counties in a data frame:
Consider the following frame:
>>> frame.inspect(frame.count())
[##] rank city population_2013 population_2010 change county
=============================================================================
[0] 1 Portland 609456 583776 4.40% Multnomah
[1] 2 Salem 160614 154637 3.87% Marion
[2] 3 Eugene 159190 156185 1.92% Lane
[3] 4 Gresham 109397 105594 3.60% Multnomah
[4] 5 Hillsboro 97368 91611 6.28% Washington
[5] 6 Beaverton 93542 89803 4.16% Washington
[6] 15 Grants Pass 35076 34533 1.57% Josephine
[7] 16 Oregon City 34622 31859 8.67% Clackamas
[8] 17 McMinnville 33131 32187 2.93% Yamhill
[9] 18 Redmond 27427 26215 4.62% Deschutes
[10] 19 Tualatin 26879 26054 4.17% Washington
[11] 20 West Linn 25992 25109 3.52% Clackamas
[12] 7 Bend 81236 76639 6.00% Deschutes
[13] 8 Medford 77677 74907 3.70% Jackson
[14] 9 Springfield 60177 59403 1.30% Lane
[15] 10 Corvallis 55298 54462 1.54% Benton
[16] 11 Albany 51583 50158 2.84% Linn
[17] 12 Tigard 50444 48035 5.02% Washington
[18] 13 Lake Oswego 37610 36619 2.71% Clackamas
[19] 14 Keizer 37064 36478 1.61% Marion
>>> top_frame = frame.top_k("county", 2)
[===Job Progress===]
>>> top_frame.inspect()
[#] county count
======================
[0] Washington 4.0
[1] Clackamas 3.0
def top_k(self, column_name, k, weight_column=None):
"""
Most or least frequent column values.
Parameters
----------
:param column_name: (str) The column whose top (or bottom) K distinct values are to be calculated.
:param k: (int) Number of entries to return (If k is negative, return bottom k).
:param weight_column: (Optional[str]) The column that provides weights (frequencies) for the topK calculation.
Must contain numerical data. Default is 1 for all items.
Calculate the top (or bottom) K distinct values by count of a column. The column can be
weighted. All data elements of weight <= 0 are excluded from the calculation, as are
all data elements whose weight is NaN or infinite. If there are no data elements of
finite weight > 0, then topK is empty.
Examples
--------
For this example, we calculate the top 2 counties in a data frame:
Consider the following frame:
>>> frame.inspect(frame.count())
[##] rank city population_2013 population_2010 change county
=============================================================================
[0] 1 Portland 609456 583776 4.40% Multnomah
[1] 2 Salem 160614 154637 3.87% Marion
[2] 3 Eugene 159190 156185 1.92% Lane
[3] 4 Gresham 109397 105594 3.60% Multnomah
[4] 5 Hillsboro 97368 91611 6.28% Washington
[5] 6 Beaverton 93542 89803 4.16% Washington
[6] 15 Grants Pass 35076 34533 1.57% Josephine
[7] 16 Oregon City 34622 31859 8.67% Clackamas
[8] 17 McMinnville 33131 32187 2.93% Yamhill
[9] 18 Redmond 27427 26215 4.62% Deschutes
[10] 19 Tualatin 26879 26054 4.17% Washington
[11] 20 West Linn 25992 25109 3.52% Clackamas
[12] 7 Bend 81236 76639 6.00% Deschutes
[13] 8 Medford 77677 74907 3.70% Jackson
[14] 9 Springfield 60177 59403 1.30% Lane
[15] 10 Corvallis 55298 54462 1.54% Benton
[16] 11 Albany 51583 50158 2.84% Linn
[17] 12 Tigard 50444 48035 5.02% Washington
[18] 13 Lake Oswego 37610 36619 2.71% Clackamas
[19] 14 Keizer 37064 36478 1.61% Marion
>>> top_frame = frame.top_k("county", 2)
[===Job Progress===]
>>> top_frame.inspect()
[#] county count
======================
[0] Washington 4.0
[1] Clackamas 3.0
"""
from sparktk.frame.frame import Frame
return Frame(self._tc, self._scala.topK(column_name, k, self._tc.jutils.convert.to_scala_option(weight_column)))
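For the unweighted case, the result is equivalent to counting distinct values and keeping the k most common. An illustrative local sketch with collections.Counter over the same county data (not the library's implementation):

from collections import Counter

counties = ["Multnomah", "Marion", "Lane", "Multnomah", "Washington", "Washington",
            "Josephine", "Clackamas", "Yamhill", "Deschutes", "Washington", "Clackamas",
            "Deschutes", "Jackson", "Lane", "Benton", "Linn", "Washington", "Clackamas", "Marion"]
Counter(counties).most_common(2)   # [('Washington', 4), ('Clackamas', 3)]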
def unflatten_columns(
self, columns, delimiter=',')
Compacts data from multiple rows based on cell data.
columns | (str or List[str]): | Name of the column(s) to be used as keys for unflattening. |
delimiter | (Optional[str]): | Separator for the data in the result columns. Default is comma (,). |
Groups together cells in all columns (less the composite key) using "," as string delimiter. The original rows are deleted. The grouping takes place based on a composite key created from cell values. The column datatypes are changed to string.
Given a data file::
user1 1/1/2015 1 70
user1 1/1/2015 2 60
user2 1/1/2015 1 65
The commands to bring the data into a frame, where it can be worked on:
>>> frame.inspect()
[#] a b c d
===========================
[0] user1 1/1/2015 1 70
[1] user1 1/1/2015 2 60
[2] user2 1/1/2015 1 65
Unflatten the data using columns a & b:
>>> frame.unflatten_columns(['a','b'])
[===Job Progress===]
Check again:
>>> frame.inspect()
[#] a b c d
================================
[0] user1 1/1/2015 1,2 70,60
[1] user2 1/1/2015 1 65
Alternatively, unflatten_columns() also accepts a single column like:
>>> frame.unflatten_columns('a')
[===Job Progress===]
>>> frame.inspect()
[#] a b c d
=========================================
[0] user1 1/1/2015,1/1/2015 1,2 70,60
[1] user2 1/1/2015 1 65
def unflatten_columns(self, columns, delimiter=","):
"""
Compacts data from multiple rows based on cell data.
Parameters
----------
:param columns: (str or List[str]) Name of the column(s) to be used as keys for unflattening.
:param delimiter: (Optional[str]) Separator for the data in the result columns. Default is comma (,).
Groups together cells in all columns (less the composite key) using "," as string delimiter.
The original rows are deleted.
The grouping takes place based on a composite key created from cell values.
The column datatypes are changed to string.
Examples
--------
Given a data file::
user1 1/1/2015 1 70
user1 1/1/2015 2 60
user2 1/1/2015 1 65
The commands to bring the data into a frame, where it can be worked on:
>>> frame.inspect()
[#] a b c d
===========================
[0] user1 1/1/2015 1 70
[1] user1 1/1/2015 2 60
[2] user2 1/1/2015 1 65
Unflatten the data using columns a & b:
>>> frame.unflatten_columns(['a','b'])
[===Job Progress===]
Check again:
>>> frame.inspect()
[#] a b c d
================================
[0] user1 1/1/2015 1,2 70,60
[1] user2 1/1/2015 1 65
Alternatively, unflatten_columns() also accepts a single column like:
>>> frame.unflatten_columns('a')
[===Job Progress===]
>>> frame.inspect()
[#] a b c d
=========================================
[0] user1 1/1/2015,1/1/2015 1,2 70,60
[1] user2 1/1/2015 1 65
"""
if not isinstance(columns, list):
columns = [columns]
return self._scala.unflattenColumns(self._tc.jutils.convert.to_scala_list_string(columns),
delimiter)
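Locally, the compaction groups rows by the composite key and joins the remaining cells with the delimiter. A rough pure-Python sketch of that grouping (illustrative only, not the Scala implementation):

from collections import OrderedDict

# Illustrative only: group rows by a composite key and join the other cells with a delimiter.
def unflatten(rows, key_indices, delimiter=","):
    groups = OrderedDict()
    for row in rows:
        key = tuple(row[i] for i in key_indices)
        groups.setdefault(key, []).append([cell for i, cell in enumerate(row) if i not in key_indices])
    result = []
    for key, grouped in groups.items():
        joined = [delimiter.join(str(cell) for cell in cells) for cells in zip(*grouped)]
        result.append(list(key) + joined)
    return result

rows = [["user1", "1/1/2015", 1, 70], ["user1", "1/1/2015", 2, 60], ["user2", "1/1/2015", 1, 65]]
unflatten(rows, [0, 1])
# [['user1', '1/1/2015', '1,2', '70,60'], ['user2', '1/1/2015', '1', '65']]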
def validate_pyrdd_schema(
self, pyrdd, schema)
def validate_pyrdd_schema(self, pyrdd, schema):
if isinstance(pyrdd, RDD):
schema_length = len(schema)
num_bad_values = self._tc.sc.accumulator(0)
def validate_schema(row, accumulator):
data = []
if len(row) != schema_length:
raise ValueError("Length of the row (%s) does not match the schema length (%s)." % (len(row), len(schema)))
for index, column in enumerate(schema):
data_type = column[1]
try:
if row[index] is not None:
data.append(dtypes.dtypes.cast(row[index], data_type))
except:
data.append(None)
accumulator += 1
return data
validated_rdd = pyrdd.map(lambda row: validate_schema(row, num_bad_values))
# Force rdd to load, so that we can get a bad value count
validated_rdd.count()
return SchemaValidationReturn(validated_rdd, num_bad_values.value)
else:
raise TypeError("Unable to validate schema, because the pyrdd provided is not an RDD.")