sparktk.frame.frame module
# vim: set encoding=utf-8
# Copyright (c) 2016 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from pyspark.rdd import RDD
from pyspark.sql import DataFrame
from sparktk.frame.pyframe import PythonFrame
from sparktk.frame.schema import schema_to_python, schema_to_scala
from sparktk import dtypes
import logging
logger = logging.getLogger('sparktk')
from sparktk.propobj import PropertiesObject
from sparktk import TkContext
# import constructors for the API's sake (not actually dependencies of the Frame class)
from sparktk.frame.constructors.create import create
from sparktk.frame.constructors.import_csv import import_csv
from sparktk.frame.constructors.import_hbase import import_hbase
from sparktk.frame.constructors.import_hive import import_hive
from sparktk.frame.constructors.import_jdbc import import_jdbc
from sparktk.frame.constructors.import_pandas import import_pandas
__all__ = ["create",
"Frame",
"import_csv",
"import_hbase",
"import_hive",
"import_jdbc",
"import_pandas",
"load"]
class Frame(object):
def __init__(self, tc, source, schema=None, validate_schema=False):
"""(Private constructor -- use tc.frame.create or other methods available from the TkContext)"""
self._tc = tc
if self._is_scala_frame(source):
self._frame = source
elif self._is_scala_rdd(source):
scala_schema = schema_to_scala(tc.sc, schema)
self._frame = self._create_scala_frame(tc.sc, source, scala_schema)
elif self._is_scala_dataframe(source):
self._frame = self._create_scala_frame_from_scala_dataframe(tc.sc, source)
elif isinstance(source, DataFrame):
self._frame = self._create_scala_frame_from_scala_dataframe(tc.sc, source._jdf)
elif isinstance(source, PythonFrame):
self._frame = source
else:
if not isinstance(source, RDD):
if not isinstance(source, list) or (len(source) > 0 and any(not isinstance(row, (list, tuple)) for row in source)):
raise TypeError("Invalid data source. The data parameter must be a 2-dimensional list (list of row data) or an RDD.")
inferred_schema = False
if isinstance(schema, list):
if all(isinstance(item, basestring) for item in schema):
# check if schema is just a list of column names (versus string and data type tuples)
schema = self._infer_schema(source, schema)
inferred_schema = True
elif not all(isinstance(item, tuple) and
len(item) == 2 and
isinstance(item[0], basestring) for item in schema):
raise TypeError("Invalid schema. Expected a list of tuples (str, type) with the column name and data type, but received type %s." % type(schema))
# check for duplicate column names
column_names = [col[0] for col in schema]
duplicate_column_names = set([col for col in column_names if column_names.count(col) > 1])
if len(duplicate_column_names) > 0:
raise ValueError("Invalid schema, column names cannot be duplicated: %s" % ", ".join(duplicate_column_names))
elif schema is None:
schema = self._infer_schema(source)
inferred_schema = True
else:
# Schema is not a list or None
raise TypeError("Invalid schema type: %s. Expected a list of tuples (str, type) with the column name and data type." % type(schema))
for item in schema:
if not self._is_supported_datatype(item[1]):
if inferred_schema:
raise TypeError("The %s data type was found when inferring the schema, and it is not a "
"supported data type. Instead, specify a schema that uses a supported data "
"type, and enable validate_schema so that the data is converted to the proper "
"data type.\n\nInferred schema: %s\n\nSupported data types: %s" %
(str(item[1]), str(schema), dtypes.dtypes))
else:
raise TypeError("Invalid schema. %s is not a supported data type.\n\nSupported data types: %s" %
(str(item[1]), dtypes.dtypes))
source = tc.sc.parallelize(source)
if schema and validate_schema:
# Validate schema by going through the data and checking the data type and attempting to parse it
validate_schema_result = self.validate_pyrdd_schema(source, schema)
source = validate_schema_result.validated_rdd
logger.debug("%s values were unable to be parsed to the schema's data type." % validate_schema_result.bad_value_count)
# If schema contains matrix datatype, then apply type_coercer to convert list[list] to numpy ndarray
map_source = MatrixCoercion.schema_is_coercible(source, list(schema))
self._frame = PythonFrame(map_source, schema)
def _merge_types(self, type_list_a, type_list_b):
"""
Merges two lists of data types
:param type_list_a: First list of data types to merge
:param type_list_b: Second list of data types to merge
:return: List of merged data types
"""
if not isinstance(type_list_a, list) or not isinstance(type_list_b, list):
raise TypeError("Unable to generate schema, because schema is not a list.")
if len(type_list_a) != len(type_list_b):
raise ValueError("Length of each row must be the same (found rows with lengths: %s and %s)." % (len(type_list_a), len(type_list_b)))
return [dtypes._DataTypes.merge_types(type_list_a[i], type_list_b[i]) for i in xrange(0, len(type_list_a))]
def _infer_types_for_row(self, row):
"""
Returns a list of data types for the data in the specified row
:param row: List or Row of data
:return: List of data types
"""
inferred_types = []
for item in row:
if item is None:
inferred_types.append(int)
elif not isinstance(item, list):
inferred_types.append(type(item))
else:
inferred_types.append(dtypes.vector((len(item))))
return inferred_types
def _infer_schema(self, data, column_names=[], sample_size=100):
"""
Infers the schema based on the data in the RDD.
:param sc: Spark Context
:param data: Data used to infer schema
:param column_names: Optional column names to use in the schema. If no column names are provided, columns
are given numbered names. If there are more columns in the RDD than there are in the
column_names list, remaining columns will be numbered.
:param sample_size: Number of rows to check when inferring the schema. Defaults to 100.
:return: Schema
"""
inferred_schema = []
if isinstance(data, list):
if len(data) > 0:
# get the schema for the first row
data_types = self._infer_types_for_row(data[0])
sample_size = min(sample_size, len(data))
for i in xrange (1, sample_size):
data_types = self._merge_types(data_types, self._infer_types_for_row(data[i]))
for i, data_type in enumerate(data_types):
column_name = "C%s" % i
if len(column_names) > i:
column_name = column_names[i]
inferred_schema.append((column_name, data_type))
else:
raise TypeError("Unable to infer schema, because the data provided is not a list.")
return inferred_schema
def _is_supported_datatype(self, data_type):
"""
Returns True if the specified data_type is supported.
"""
supported_primitives = [int, float, long, str, unicode]
if data_type in supported_primitives:
return True
elif data_type is dtypes.datetime:
return True
elif type(data_type) is dtypes.vector:
return True
elif data_type is dtypes.matrix:
return True
else:
return False
def validate_pyrdd_schema(self, pyrdd, schema):
if isinstance(pyrdd, RDD):
schema_length = len(schema)
num_bad_values = self._tc.sc.accumulator(0)
def validate_schema(row, accumulator):
data = []
if len(row) != schema_length:
raise ValueError("Length of the row (%s) does not match the schema length (%s)." % (len(row), len(schema)))
for index, column in enumerate(schema):
data_type = column[1]
try:
if row[index] is not None:
data.append(dtypes.dtypes.cast(row[index], data_type))
except:
data.append(None)
accumulator += 1
return data
validated_rdd = pyrdd.map(lambda row: validate_schema(row, num_bad_values))
# Force rdd to load, so that we can get a bad value count
validated_rdd.count()
return SchemaValidationReturn(validated_rdd, num_bad_values.value)
else:
raise TypeError("Unable to validate schema, because the pyrdd provided is not an RDD.")
@staticmethod
def _create_scala_frame(sc, scala_rdd, scala_schema):
"""call constructor in JVM"""
return sc._jvm.org.trustedanalytics.sparktk.frame.Frame(scala_rdd, scala_schema, False)
@staticmethod
def _create_scala_frame_from_scala_dataframe(sc, scala_dataframe):
"""call constructor in JVM"""
return sc._jvm.org.trustedanalytics.sparktk.frame.Frame(scala_dataframe)
@staticmethod
def _from_scala(tc, scala_frame):
"""creates a python Frame for the given scala Frame"""
return Frame(tc, scala_frame)
def _frame_to_scala(self, python_frame):
"""converts a PythonFrame to a Scala Frame"""
scala_schema = schema_to_scala(self._tc.sc, python_frame.schema)
scala_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.rdd.PythonJavaRdd.pythonToScala(python_frame.rdd._jrdd, scala_schema)
return self._create_scala_frame(self._tc.sc, scala_rdd, scala_schema)
def _is_scala_frame(self, item):
return self._tc._jutils.is_jvm_instance_of(item, self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.Frame)
def _is_scala_rdd(self, item):
return self._tc._jutils.is_jvm_instance_of(item, self._tc.sc._jvm.org.apache.spark.rdd.RDD)
def _is_scala_dataframe(self, item):
return self._tc._jutils.is_jvm_instance_of(item, self._tc.sc._jvm.org.apache.spark.sql.DataFrame)
def _is_python_rdd(self, item):
return isinstance(item, RDD)
@property
def _is_scala(self):
"""answers whether the current frame is backed by a Scala Frame"""
return self._is_scala_frame(self._frame)
@property
def _is_python(self):
"""answers whether the current frame is backed by a _PythonFrame"""
return not self._is_scala
@property
def _scala(self):
"""gets frame backend as Scala Frame, causes conversion if it is current not"""
if self._is_python:
# If schema contains matrix datatype,
# then apply type_coercer_pymllib to convert ndarray to pyspark.mllib DenseMatrix for serialization purposes on the Java side
self._frame.rdd = MatrixCoercion.schema_is_coercible(self._frame.rdd, list(self._frame.schema), True)
# convert PythonFrame to a Scala Frame
scala_schema = schema_to_scala(self._tc.sc, self._frame.schema)
scala_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.pythonToScala(self._frame.rdd._jrdd, scala_schema)
self._frame = self._create_scala_frame(self._tc.sc, scala_rdd, scala_schema)
return self._frame
@property
def _python(self):
"""gets frame backend as _PythonFrame, causes conversion if it is current not"""
if self._is_scala:
# convert Scala Frame to a PythonFrame"""
scala_schema = self._frame.schema()
java_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(self._frame.rdd())
python_schema = schema_to_python(self._tc.sc, scala_schema)
python_rdd = RDD(java_rdd, self._tc.sc)
# If schema contains matrix datatype, then apply type_coercer to convert list[list] to numpy ndarray
map_python_rdd = MatrixCoercion.schema_is_coercible(python_rdd, list(python_schema))
self._frame = PythonFrame(map_python_rdd, python_schema)
return self._frame
##########################################################################
# API
##########################################################################
@property
def rdd(self):
"""pyspark RDD (causes conversion if currently backed by a Scala RDD)"""
return self._python.rdd
@property
def dataframe(self):
"""pyspark DataFrame (causes conversion through Scala)"""
return DataFrame(self._scala.dataframe(), self._tc.sql_context)
@property
def schema(self):
if self._is_scala:
return schema_to_python(self._tc.sc, self._frame.schema()) # need ()'s on schema because it's a def in scala
return self._frame.schema
@property
def column_names(self):
"""
Column identifications in the current frame.
:return: list of names of all the frame's columns
Returns the names of the columns of the current frame.
Examples
--------
>>> frame.column_names
[u'name', u'age', u'tenure', u'phone']
"""
return [name for name, data_type in self.schema]
# Frame Operations
from sparktk.frame.ops.add_columns import add_columns
from sparktk.frame.ops.append import append
from sparktk.frame.ops.assign_sample import assign_sample
from sparktk.frame.ops.bin_column import bin_column
from sparktk.frame.ops.binary_classification_metrics import binary_classification_metrics
from sparktk.frame.ops.box_cox import box_cox
from sparktk.frame.ops.categorical_summary import categorical_summary
from sparktk.frame.ops.collect import collect
from sparktk.frame.ops.column_median import column_median
from sparktk.frame.ops.column_mode import column_mode
from sparktk.frame.ops.column_summary_statistics import column_summary_statistics
from sparktk.frame.ops.copy import copy
from sparktk.frame.ops.correlation import correlation
from sparktk.frame.ops.correlation_matrix import correlation_matrix
from sparktk.frame.ops.count import count
from sparktk.frame.ops.covariance import covariance
from sparktk.frame.ops.covariance_matrix import covariance_matrix
from sparktk.frame.ops.cumulative_percent import cumulative_percent
from sparktk.frame.ops.cumulative_sum import cumulative_sum
from sparktk.frame.ops.dot_product import dot_product
from sparktk.frame.ops.drop_columns import drop_columns
from sparktk.frame.ops.drop_duplicates import drop_duplicates
from sparktk.frame.ops.drop_rows import drop_rows
from sparktk.frame.ops.ecdf import ecdf
from sparktk.frame.ops.entropy import entropy
from sparktk.frame.ops.export_data import export_to_csv, export_to_jdbc, export_to_json, export_to_hbase, export_to_hive
from sparktk.frame.ops.filter import filter
from sparktk.frame.ops.flatten_columns import flatten_columns
from sparktk.frame.ops.group_by import group_by
from sparktk.frame.ops.histogram import histogram
from sparktk.frame.ops.inspect import inspect
from sparktk.frame.ops.join_inner import join_inner
from sparktk.frame.ops.join_left import join_left
from sparktk.frame.ops.join_right import join_right
from sparktk.frame.ops.join_outer import join_outer
from sparktk.frame.ops.map_columns import map_columns
from sparktk.frame.ops.matrix_covariance_matrix import matrix_covariance_matrix
from sparktk.frame.ops.matrix_pca import matrix_pca
from sparktk.frame.ops.matrix_svd import matrix_svd
from sparktk.frame.ops.multiclass_classification_metrics import multiclass_classification_metrics
from sparktk.frame.ops.power_iteration_clustering import power_iteration_clustering
from sparktk.frame.ops.quantile_bin_column import quantile_bin_column
from sparktk.frame.ops.quantiles import quantiles
from sparktk.frame.ops.rename_columns import rename_columns
from sparktk.frame.ops.reverse_box_cox import reverse_box_cox
from sparktk.frame.ops.save import save
from sparktk.frame.ops.sort import sort
from sparktk.frame.ops.sortedk import sorted_k
from sparktk.frame.ops.take import take
from sparktk.frame.ops.tally import tally
from sparktk.frame.ops.tally_percent import tally_percent
from sparktk.frame.ops.timeseries_augmented_dickey_fuller_test import timeseries_augmented_dickey_fuller_test
from sparktk.frame.ops.timeseries_breusch_godfrey_test import timeseries_breusch_godfrey_test
from sparktk.frame.ops.timeseries_breusch_pagan_test import timeseries_breusch_pagan_test
from sparktk.frame.ops.timeseries_durbin_watson_test import timeseries_durbin_watson_test
from sparktk.frame.ops.timeseries_from_observations import timeseries_from_observations
from sparktk.frame.ops.timeseries_slice import timeseries_slice
from sparktk.frame.ops.to_pandas import to_pandas
from sparktk.frame.ops.topk import top_k
from sparktk.frame.ops.unflatten_columns import unflatten_columns
def load(path, tc=TkContext.implicit):
"""load Frame from given path"""
TkContext.validate(tc)
return tc.load(path, Frame)
class SchemaValidationReturn(PropertiesObject):
"""
Return value from schema validation that includes the rdd of validated values and the number of bad values
that were found.
"""
def __init__(self, validated_rdd, bad_value_count):
self._validated_rdd = validated_rdd
self._bad_value_count = bad_value_count
@property
def validated_rdd(self):
"""
RDD of values that have been cast to the data type specified by the frame's schema.
"""
return self._validated_rdd
@property
def bad_value_count(self):
"""
Number of values that were unable to be parsed to the data type specified by the schema.
"""
return self._bad_value_count
class MatrixCoercion(object):
@staticmethod
def schema_is_coercible(source, python_schema, in_scala=False):
"""
Checks whether the python schema is coercible, i.e. whether it contains the matrix data type.
If so, the source RDD is mapped so that matrix column values are converted: list[list] to numpy ndarray (or, when in_scala is True, ndarray to mllib DenseMatrix).
"""
flag = False
for schema in python_schema:
if type(schema[1]) == dtypes._Matrix:
flag = True
break
if flag:
if in_scala:
map_source = source.map(MatrixCoercion.type_coercer_pymllib(python_schema))
else:
map_source = source.map(MatrixCoercion.type_coercer(python_schema))
else:
map_source = source
return map_source
@staticmethod
def type_coercer(schema):
"""
When creating a new (python) frame, or when converting a frame from scala to python,
this function scans each row and does the following:
* when creating a new python frame - if it finds a list[list] (which represents a matrix) as a column value,
it converts it to a numpy ndarray
* when converting a frame from scala to python - (scala converts DenseMatrix --> JList[JList[Double]] in JConvert.scala,
and jconvert.py converts JList[JList[Double]] --> list[list[float64]]), it converts the resulting list[list] to an ndarray
"""
def decorator(row):
result = []
import numpy as np
for i in xrange(len(schema)):
if type(schema[i][1]) == dtypes._Matrix:
if isinstance(row[i], list):
result.append(np.array(row[i], dtype=np.float64))
else:
result.append(row[i])
else:
result.append(row[i])
return result
return decorator
@staticmethod
def type_coercer_pymllib(schema):
"""
When converting from python to scala, this function scans each row and converts ndarrays
to pyspark mllib DenseMatrix objects, so that the auto-picklers understand how to serialize from pyspark mllib DenseMatrix to Scala MLlib DenseMatrix.
For serialization to work we have to explicitly call SparkAliases.getSparkMLLibSerDe in the pythonToScala() method of the PythonJavaRdd.scala class.
An ndarray stores data as row-major, whereas an mllib DenseMatrix stores data as column-major.
To construct the mllib DenseMatrix as row-major we set isTransposed=True.
"""
def decorator(row):
result = []
from pyspark.mllib.linalg import DenseMatrix
for i in xrange(len(schema)):
if type(schema[i][1]) == dtypes._Matrix:
shape = row[i].shape
arr = row[i].flatten()
# By default, mllib DenseMatrix constructs a column-major matrix.
# Setting isTransposed=True constructs a row-major DenseMatrix instead.
dm = DenseMatrix(shape[0], shape[1], arr, isTransposed=True)
result.append(dm)
else:
result.append(row[i])
return result
return decorator
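To illustrate what the two coercers do, here is a minimal sketch (an illustration only, assuming numpy and pyspark.mllib are available) of the conversions applied to a single matrix cell:
>>> import numpy as np
>>> from pyspark.mllib.linalg import DenseMatrix
>>> cell = [[1.0, 2.0], [3.0, 4.0]]        # matrix column value arrives as list[list]
>>> nd = np.array(cell, dtype=np.float64)  # type_coercer: list[list] -> numpy ndarray
>>> dm = DenseMatrix(nd.shape[0], nd.shape[1], nd.flatten(), isTransposed=True)  # type_coercer_pymllib: ndarray -> row-major DenseMatrix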
Functions
def create(
data, schema=None, validate_schema=False, tc=<class 'sparktk.arguments.implicit'>)
Creates a frame from the given data and schema. If no schema data types are provided, the schema is inferred based on the data in the first 100 rows.
If schema validation is enabled, all data is checked to ensure that it matches the schema. If the data does not match the schema's data type, it attempts to cast the data to the proper data type. When the data cannot be cast to the schema's data type, the item will be missing (None) in the frame.
data | (List of row data or RDD): | Data source |
schema | (Optional(list[tuple(str, type)] or list[str])): | Optionally specify a schema (list of tuples of string column names and data types), column names (list of strings, in which case the column data types will be inferred), or None (column data types will be inferred and column names will be numbered like C0, C1, C2, etc). |
validate_schema | (Optional(bool)): | When True, all data is checked to ensure that it matches the schema. If the data does not match the schema's data type, it attempts to cast the data to the proper data type. When the data cannot be cast to the schema's data type, a missing value (None) is inserted in its place. Defaults to False. |
tc | (TkContext): | TK context |
Returns | (Frame): | Frame loaded with the specified data |
Create a frame with the specified data.
>>> data = [["Bob", 30, 8], ["Jim", 45, 9.5], ["Sue", 25, 7], ["George", 15, 6], ["Jennifer", 18, 8.5]]
>>> frame = tc.frame.create(data)
Since no schema is provided, the schema will be inferred. Note that the data set had a mix of integers and floats in the third column. The schema will use the most general data type from the data that it sees, so in this example, the column is treated as a float.
>>> frame.schema
[('C0', <type 'str'>), ('C1', <type 'int'>), ('C2', <type 'float'>)]
>>> frame.inspect()
[#] C0 C1 C2
======================
[0] Bob 30 8
[1] Jim 45 9.5
[2] Sue 25 7
[3] George 15 6
[4] Jennifer 18 8.5
We could also enable schema validation, which checks the data against the schema. If the data does not match the schema's data type, it attempts to cast the data to the proper data type.
>>> frame = tc.frame.create(data, validate_schema=True)
In this example with schema validation enabled, the integers in column C2 get casted to floats:
>>> frame.inspect()
[#] C0 C1 C2
======================
[0] Bob 30 8.0
[1] Jim 45 9.5
[2] Sue 25 7.0
[3] George 15 6.0
[4] Jennifer 18 8.5
We could also provide a list of column names when creating the frame. When a list of column names is provided, the data types for the schema are still inferred, but the columns in the schema are labeled with the specified names.
>>> frame = tc.frame.create(data, schema=["name", "age", "shoe_size"], validate_schema=True)
>>> frame.schema
[('name', <type 'str'>), ('age', <type 'int'>), ('shoe_size', <type 'float'>)]
>>> frame.inspect()
[#] name age shoe_size
=============================
[0] Bob 30 8.0
[1] Jim 45 9.5
[2] Sue 25 7.0
[3] George 15 6.0
[4] Jennifer 18 8.5
Note that if a value cannot be parsed as the specified data type in the schema, it will show up as missing (None) if validate_schema is enabled. For example, consider the following frame where columns are defined as integers, but the data specified has a string in the second row.
>>> data = [[1, 2, 3], [4, "five", 6]]
>>> schema = [("a", int), ("b", int), ("c", int)]
>>> frame = tc.frame.create(data, schema, validate_schema = True)
>>> frame.inspect()
[#] a b c
===============
[0] 1 2 3
[1] 4 None 6
Note that the spot where the string was located has its value missing (None), since it couldn't be parsed to an integer. If validate_schema was disabled, no attempt would be made to parse the data to the data type specified by the schema, and further frame operations may fail due to the data type discrepancy.
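The data parameter is not limited to local Python lists; an existing pyspark RDD (or DataFrame) can also be used as the source. A minimal sketch, assuming tc is an active TkContext (when the source is an RDD, a full schema of (name, type) tuples should be supplied, since schema inference only operates on local lists of rows):
>>> rdd = tc.sc.parallelize([[1, "a"], [2, "b"], [3, "c"]])
>>> frame = tc.frame.create(rdd, schema=[("number", int), ("letter", str)])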
def create(data, schema=None, validate_schema=False, tc=TkContext.implicit):
"""
Creates a frame from the given data and schema. If no schema data types are provided, the schema is inferred
based on the data in the first 100 rows.
If schema validation is enabled, all data is checked to ensure that it matches the schema. If the data does
not match the schema's data type, it attempts to cast the data to the proper data type. When the data cannot
be cast to the schema's data type, the item will be missing (None) in the frame.
Parameters
----------
:param data: (List of row data or RDD) Data source
:param schema: (Optional(list[tuple(str, type)] or list[str])) Optionally specify a schema (list of tuples of
string column names and data type), column names (list of strings, and the column data types will
be inferred) or None (column data types will be inferred and column names will be numbered like C0,
C1, C2, etc).
:param validate_schema: (Optional(bool)) When True, all data is checked to ensure that it matches the schema.
If the data does not match the schema's data type, it attempts to cast the data to the
proper data type. When the data cannot be cast to the schema's data type, a
missing value (None) is inserted in its place. Defaults to False.
:param tc: TkContext
:return: (Frame) Frame loaded with the specified data
Examples
--------
Create a frame with the specified data.
>>> data = [["Bob", 30, 8], ["Jim", 45, 9.5], ["Sue", 25, 7], ["George", 15, 6], ["Jennifer", 18, 8.5]]
>>> frame = tc.frame.create(data)
Since no schema is provided, the schema will be inferred. Note that the data set had a mix of integers and
floats in the third column. The schema will use the most general data type from the data that it sees, so in
this example, the column is treated as a float.
>>> frame.schema
[('C0', <type 'str'>), ('C1', <type 'int'>), ('C2', <type 'float'>)]
>>> frame.inspect()
[#] C0 C1 C2
======================
[0] Bob 30 8
[1] Jim 45 9.5
[2] Sue 25 7
[3] George 15 6
[4] Jennifer 18 8.5
We could also enable schema validation, which checks the data against the schema. If the data does not match the
schema's data type, it attempts to cast the data to the proper data type.
>>> frame = tc.frame.create(data, validate_schema=True)
In this example with schema validation enabled, the integers in column C2 get casted to floats:
>>> frame.inspect()
[#] C0 C1 C2
======================
[0] Bob 30 8.0
[1] Jim 45 9.5
[2] Sue 25 7.0
[3] George 15 6.0
[4] Jennifer 18 8.5
We could also provide a list of column names when creating the frame. When a list of column names is provided,
the data types for the schema are still inferred, but the columns in the schema are labeled with the specified names.
>>> frame = tc.frame.create(data, schema=["name", "age", "shoe_size"], validate_schema=True)
>>> frame.schema
[('name', <type 'str'>), ('age', <type 'int'>), ('shoe_size', <type 'float'>)]
>>> frame.inspect()
[#] name age shoe_size
=============================
[0] Bob 30 8.0
[1] Jim 45 9.5
[2] Sue 25 7.0
[3] George 15 6.0
[4] Jennifer 18 8.5
Note that if a value cannot be parsed as the specified data type in the schema, it will show up as missing (None)
if validate_schema is enabled. For example, consider the following frame where columns are defined as integers,
but the data specified has a string in the second row.
>>> data = [[1, 2, 3], [4, "five", 6]]
>>> schema = [("a", int), ("b", int), ("c", int)]
>>> frame = tc.frame.create(data, schema, validate_schema = True)
>>> frame.inspect()
[#] a b c
===============
[0] 1 2 3
[1] 4 None 6
Note that the spot where the string was located has its value missing (None), since it couldn't be parsed to an
integer. If validate_schema was disabled, no attempt would be made to parse the data to the data type specified by the
schema, and further frame operations may fail due to the data type discrepancy.
"""
TkContext.validate(tc)
if data is None:
data = []
if not isinstance(data, list)\
and not isinstance(data, (RDD, DataFrame))\
and not tc._jutils.is_jvm_instance_of(data, tc.sc._jvm.org.apache.spark.rdd.RDD)\
and not tc._jutils.is_jvm_instance_of(data, tc.sc._jvm.org.apache.spark.sql.DataFrame):
raise TypeError("Invalid data source. Expected the data parameter to be a 2-dimensional list (list of row data) or an RDD or DataFrame, but received: %s" % type(data))
from sparktk.frame.frame import Frame
return Frame(tc, data, schema, validate_schema)
def import_csv(
path, delimiter=',', header=False, infer_schema=True, schema=None, datetime_format="yyyy-MM-dd'T'HH:mm:ss.SSSX", tc=<class 'sparktk.arguments.implicit'>)
Creates a frame with data from a csv file.
path | (str): | Full path to the csv file |
delimiter | (Optional[str]): | A string which indicates the separation of data fields. This is usually a single character and could be a non-visible character, such as a tab. The default delimiter is a comma (,). |
header | (Optional[bool]): | Boolean value indicating if the first line of the file will be used to name columns, and not be included in the data. The default value is false. |
infer_schema | (Optional[bool]): | Boolean value indicating if the column types will be automatically inferred. It requires one extra pass over the data and is true by default. |
schema | (Optional[List[tuple(str, type)]]): | Optionally specify the schema for the dataset. Number of columns specified in the schema must match the number of columns in the csv file provided. If the value from the csv file cannot be converted to the data type specified by the schema (for example, if the csv file has a string, and the schema specifies an int), the value will show up as missing (None) in the frame. |
datetime_format | (str): | String specifying how date/time columns are formatted, using the java.text.SimpleDateFormat specified at https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html |
Returns | (Frame): | Frame that contains the data from the csv file |
Load a frame from a csv file by specifying the path to the file, delimiter, and options that specify that there is a header and to infer the schema based on the data.
>>> file_path = "../datasets/cities.csv"
>>> frame = tc.frame.import_csv(file_path, "|", header=True, infer_schema=True)
-etc-
>>> frame.inspect()
[#] rank city population_2013 population_2010 change county
============================================================================
[0] 1 Portland 609456 583776 4.40% Multnomah
[1] 2 Salem 160614 154637 3.87% Marion
[2] 3 Eugene 159190 156185 1.92% Lane
[3] 4 Gresham 109397 105594 3.60% Multnomah
[4] 5 Hillsboro 97368 91611 6.28% Washington
[5] 6 Beaverton 93542 89803 4.16% Washington
[6] 15 Grants Pass 35076 34533 1.57% Josephine
[7] 16 Oregon City 34622 31859 8.67% Clackamas
[8] 17 McMinnville 33131 32187 2.93% Yamhill
[9] 18 Redmond 27427 26215 4.62% Deschutes
>>> frame.schema
[('rank', <type 'int'>), ('city', <type 'str'>), ('population_2013', <type 'int'>), ('population_2010', <type 'int'>), ('change', <type 'str'>), ('county', <type 'str'>)]
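If the inferred types are not what is wanted, a schema can be supplied explicitly; when a schema is provided, the loader skips schema inference (see the source below). A sketch, where the column names and types are assumptions based on the file above:
>>> schema = [("rank", int), ("city", str), ("population_2013", int), ("population_2010", int), ("change", str), ("county", str)]
>>> frame = tc.frame.import_csv(file_path, "|", header=True, schema=schema)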
def import_csv(path, delimiter=",", header=False, infer_schema=True, schema=None, datetime_format="yyyy-MM-dd'T'HH:mm:ss.SSSX", tc=TkContext.implicit):
"""
Creates a frame with data from a csv file.
Parameters
----------
:param path: (str) Full path to the csv file
:param delimiter: (Optional[str]) A string which indicates the separation of data fields. This is usually a
single character and could be a non-visible character, such as a tab. The default delimiter
is a comma (,).
:param header: (Optional[bool]) Boolean value indicating if the first line of the file will be used to name columns,
and not be included in the data. The default value is false.
:param infer_schema: (Optional[bool]) Boolean value indicating if the column types will be automatically inferred.
It requires one extra pass over the data and is true by default.
:param schema: (Optional[List[tuple(str, type)]]) Optionally specify the schema for the dataset. Number of
columns specified in the schema must match the number of columns in the csv file provided. If the
value from the csv file cannot be converted to the data type specified by the schema (for example,
if the csv file has a string, and the schema specifies an int), the value will show up as missing
(None) in the frame.
:param datetime_format: (str) String specifying how date/time columns are formatted, using the java.text.SimpleDateFormat
specified at https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html
:return: (Frame) Frame that contains the data from the csv file
Examples
--------
Load a frame from a csv file by specifying the path to the file, delimiter, and options that specify that
there is a header and to infer the schema based on the data.
>>> file_path = "../datasets/cities.csv"
>>> frame = tc.frame.import_csv(file_path, "|", header=True, infer_schema=True)
-etc-
>>> frame.inspect()
[#] rank city population_2013 population_2010 change county
============================================================================
[0] 1 Portland 609456 583776 4.40% Multnomah
[1] 2 Salem 160614 154637 3.87% Marion
[2] 3 Eugene 159190 156185 1.92% Lane
[3] 4 Gresham 109397 105594 3.60% Multnomah
[4] 5 Hillsboro 97368 91611 6.28% Washington
[5] 6 Beaverton 93542 89803 4.16% Washington
[6] 15 Grants Pass 35076 34533 1.57% Josephine
[7] 16 Oregon City 34622 31859 8.67% Clackamas
[8] 17 McMinnville 33131 32187 2.93% Yamhill
[9] 18 Redmond 27427 26215 4.62% Deschutes
>>> frame.schema
[('rank', <type 'int'>), ('city', <type 'str'>), ('population_2013', <type 'int'>), ('population_2010', <type 'int'>), ('change', <type 'str'>), ('county', <type 'str'>)]
"""
if schema is not None:
infer_schema = False # if a custom schema is provided, don't waste time inferring the schema during load
sparktk_schema.validate(schema)
if not isinstance(header, bool):
raise ValueError("header parameter must be a boolean, but is {0}.".format(type(header)))
if not isinstance(infer_schema, bool):
raise ValueError("infer_schema parameter must be a boolean, but is {0}.".format(type(infer_schema)))
TkContext.validate(tc)
header_str = str(header).lower()
infer_schema_str = str(infer_schema).lower()
pyspark_schema = None
if (not infer_schema) and (schema is not None):
fields = []
for column in schema:
if dtypes._data_type_to_pyspark_type_table.has_key(column[1]):
fields.append(StructField(column[0], dtypes._data_type_to_pyspark_type_table[column[1]], True))
else:
raise TypeError("Unsupported type {0} in schema for column {1}.".format(column[1], column[0]))
pyspark_schema = StructType(fields)
df = tc.sql_context.read.format(
"com.databricks.spark.csv.org.trustedanalytics.sparktk").options(
delimiter=delimiter,
header=header_str,
dateformat=datetime_format,
inferschema=infer_schema_str).load(path, schema=pyspark_schema)
df_schema = []
if schema is None:
for column in df.schema.fields:
try:
datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(type(column.dataType))
except ValueError:
raise TypeError("Unsupported data type ({0}) for column {1}.".format(str(column.dataType), column.name))
df_schema.append((column.name, datatype))
else:
df_column_count = len(df.schema.fields)
custom_column_count = len(schema)
if (df_column_count != custom_column_count):
raise ValueError("Bad schema value. The number of columns in the custom schema ({0}) must match the"
"number of columns in the csv file data ({1}).".format(custom_column_count, df_column_count))
df_schema = schema
def cast_datetime(row):
"""
The spark data frame gives us datetime objects. Convert them to long (ms since epoch) for our frame.
"""
data = []
for column_index in xrange(0, len(df_schema)):
if df_schema[column_index][1] == dtypes.datetime and isinstance(row[column_index], datetime):
data.append(long(dtypes.datetime_to_ms(row[column_index])))
else:
data.append(row[column_index])
return data
jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(df._jdf.rdd())
rdd = RDD(jrdd, tc.sc)
if any(c[1] == dtypes.datetime for c in df_schema):
# If any columns are date/time we must do this map
rdd = df.rdd.map(cast_datetime)
from sparktk.frame.frame import Frame # circular dependency, so import late
return Frame(tc, rdd, df_schema)
def import_hbase(
table_name, schema, start_tag=None, end_tag=None, tc=<class 'sparktk.arguments.implicit'>)
Import data from hbase table into frame
table_name | (str): | hbase table name |
schema | (list[list[str, str, type]]): | hbase schema as a List of List(string) (columnFamily, columnName, dataType for cell value) |
start_tag | (Optional(str)): | optional start tag for filtering |
end_tag | (Optional(str)): | optional end tag for filtering |
Returns | (Frame): | frame with data from hbase table |
Load data into frame from a hbase table
>>> frame = tc.frame.import_hbase("demo_test_hbase", [["test_family", "a", int],["test_family", "b", float], ["test_family", "c", int],["test_family", "d", int]])
-etc-
>>> frame.inspect()
[#] test_family_a test_family_b test_family_c test_family_d
===============================================================
[0] 1 0.2 -2 5
[1] 2 0.4 -1 6
[2] 3 0.6 0 7
[3] 4 0.8 1 8
Use of start_tag and end_tag (hbase creates a unique row id for data in hbase tables):
start_tag: the unique row id at which the row scan should start
end_tag: the unique row id at which the row scan should end
Assume the hbase table "test_startendtag" already has data under the "startendtag" family name, with a single column named "number".
The column contains values from 1 to 99, and the row ids are generated by hbase.
A few sample rows from the hbase table look as follows:
hbase(main):002:0> scan "test_startendtag"
ROW COLUMN+CELL
0 column=startendtag:number, timestamp=1465342524846, value=1
1 column=startendtag:number, timestamp=1465342524846, value=25
10 column=startendtag:number, timestamp=1465342524847, value=51
103 column=startendtag:number, timestamp=1465342524851, value=98
107 column=startendtag:number, timestamp=1465342524851, value=99
11 column=startendtag:number, timestamp=1465342524851, value=75
12 column=startendtag:number, timestamp=1465342524846, value=4
13 column=startendtag:number, timestamp=1465342524846, value=28
14 column=startendtag:number, timestamp=1465342524847, value=52
15 column=startendtag:number, timestamp=1465342524851, value=76
16 column=startendtag:number, timestamp=1465342524846, value=5
17 column=startendtag:number, timestamp=1465342524846, value=29
18 column=startendtag:number, timestamp=1465342524847, value=53
19 column=startendtag:number, timestamp=1465342524851, value=77
2 column=startendtag:number, timestamp=1465342524847, value=49
20 column=startendtag:number, timestamp=1465342524846, value=6
21 column=startendtag:number, timestamp=1465342524846, value=30
>>> frame = tc.frame.import_hbase("test_startendtag", [["startendtag", "number", int]], start_tag="20", end_tag="50")
-etc-
>>> frame.count()
33
>>> frame.inspect(33)
[##] startendtag_number
========================
[0] 6
[1] 30
[2] 54
[3] 78
[4] 7
[5] 31
[6] 55
[7] 79
[8] 8
[9] 32
[10] 73
[11] 56
[12] 80
[13] 9
[14] 33
[15] 57
[16] 81
[17] 10
[18] 34
[19] 58
[##] startendtag_number
========================
[20] 82
[21] 2
[22] 11
[23] 35
[24] 59
[25] 83
[26] 12
[27] 36
[28] 60
[29] 84
[30] 13
[31] 37
[32] 26
def import_hbase(table_name, schema, start_tag=None, end_tag=None, tc=TkContext.implicit):
"""
Import data from hbase table into frame
Parameters
----------
:param table_name: (str) hbase table name
:param schema: (list[list[str, str, type]]) hbase schema as a List of List(string) (columnFamily, columnName,
dataType for cell value)
:param start_tag: (Optional(str)) optional start tag for filtering
:param end_tag: (Optional(str)) optional end tag for filtering
:return: (Frame) frame with data from hbase table
Example
---------
Load data into frame from a hbase table
>>> frame = tc.frame.import_hbase("demo_test_hbase", [["test_family", "a", int],["test_family", "b", float], ["test_family", "c", int],["test_family", "d", int]])
-etc-
>>> frame.inspect()
[#] test_family_a test_family_b test_family_c test_family_d
===============================================================
[0] 1 0.2 -2 5
[1] 2 0.4 -1 6
[2] 3 0.6 0 7
[3] 4 0.8 1 8
Use of start_tag and end_tag (hbase creates a unique row id for data in hbase tables):
start_tag: the unique row id at which the row scan should start
end_tag: the unique row id at which the row scan should end
Assume the hbase table "test_startendtag" already has data under the "startendtag" family name, with a single column named "number".
The column contains values from 1 to 99, and the row ids are generated by hbase.
A few sample rows from the hbase table look as follows:
hbase(main):002:0> scan "test_startendtag"
ROW COLUMN+CELL
0 column=startendtag:number, timestamp=1465342524846, value=1
1 column=startendtag:number, timestamp=1465342524846, value=25
10 column=startendtag:number, timestamp=1465342524847, value=51
103 column=startendtag:number, timestamp=1465342524851, value=98
107 column=startendtag:number, timestamp=1465342524851, value=99
11 column=startendtag:number, timestamp=1465342524851, value=75
12 column=startendtag:number, timestamp=1465342524846, value=4
13 column=startendtag:number, timestamp=1465342524846, value=28
14 column=startendtag:number, timestamp=1465342524847, value=52
15 column=startendtag:number, timestamp=1465342524851, value=76
16 column=startendtag:number, timestamp=1465342524846, value=5
17 column=startendtag:number, timestamp=1465342524846, value=29
18 column=startendtag:number, timestamp=1465342524847, value=53
19 column=startendtag:number, timestamp=1465342524851, value=77
2 column=startendtag:number, timestamp=1465342524847, value=49
20 column=startendtag:number, timestamp=1465342524846, value=6
21 column=startendtag:number, timestamp=1465342524846, value=30
>>> frame = tc.frame.import_hbase("test_startendtag", [["startendtag", "number", int]], start_tag="20", end_tag="50")
-etc-
>>> frame.count()
33
>>> frame.inspect(33)
[##] startendtag_number
========================
[0] 6
[1] 30
[2] 54
[3] 78
[4] 7
[5] 31
[6] 55
[7] 79
[8] 8
[9] 32
[10] 73
[11] 56
[12] 80
[13] 9
[14] 33
[15] 57
[16] 81
[17] 10
[18] 34
[19] 58
[##] startendtag_number
========================
[20] 82
[21] 2
[22] 11
[23] 35
[24] 59
[25] 83
[26] 12
[27] 36
[28] 60
[29] 84
[30] 13
[31] 37
[32] 26
"""
if not isinstance(table_name, basestring):
raise ValueError("table name parameter must be a string, but is {0}.".format(type(table_name)))
if not isinstance(schema, list):
raise ValueError("schema parameter must be a list, but is {0}.".format(type(table_name)))
TkContext.validate(tc)
inner_lists=[tc._jutils.convert.to_scala_list([item[0], item[1], dtypes.to_string(item[2])]) for item in schema]
scala_final_schema = tc.jutils.convert.to_scala_list(inner_lists)
scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importHbase(tc.jutils.get_scala_sc(),
table_name, scala_final_schema,
tc._jutils.convert.to_scala_option(start_tag),
tc._jutils.convert.to_scala_option(end_tag))
from sparktk.frame.frame import Frame
return Frame(tc, scala_frame)
def import_hive(
hive_query, tc=<class 'sparktk.arguments.implicit'>)
Import data from hive table into frame.
Define the sql query to retrieve the data from a hive table.
Only a subset of Hive data types are supported:
DataType Support
---------- ------------------------------------
boolean cast to int
bigint native support
int native support
tinyint cast to int
smallint cast to int
decimal cast to double, may lose precision
double native support
float native support
date cast to string
string native support
timestamp cast to string
varchar cast to string
arrays not supported
binary not supported
char not supported
maps not supported
structs not supported
union not supported
hive_query | (str): | hive query to fetch data from table |
tc | (TkContext): | TK context |
Returns | (Frame): | returns frame with hive table data |
Load data into frame from a hive table based on hive query
>>> h_query = "select * from demo_test"
>>> frame = tc.frame.import_hive(h_query)
-etc-
>>> frame.inspect()
[#] number strformat
======================
[0] 1 one
[1] 2 two
[2] 3 three
[3] 4 four
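Because the import is driven by a query, row filtering and column selection can be pushed down into Hive itself. A sketch, reusing the demo_test table and its number column from the example above:
>>> frame = tc.frame.import_hive("select number from demo_test where number > 2")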
def import_hive(hive_query, tc=TkContext.implicit):
"""
Import data from hive table into frame.
Define the sql query to retrieve the data from a hive table.
Only a subset of Hive data types are supported:
DataType Support
---------- ------------------------------------
boolean cast to int
bigint native support
int native support
tinyint cast to int
smallint cast to int
decimal cast to double, may lose precision
double native support
float native support
date cast to string
string native support
timestamp cast to string
varchar cast to string
arrays not supported
binary not supported
char not supported
maps not supported
structs not supported
union not supported
Parameters
----------
:param hive_query: (str) hive query to fetch data from table
:param tc: (TkContext) TK context
:return: (Frame) returns frame with hive table data
Examples
--------
Load data into frame from a hive table based on hive query
>>> h_query = "select * from demo_test"
>>> frame = tc.frame.import_hive(h_query)
-etc-
>>> frame.inspect()
[#] number strformat
======================
[0] 1 one
[1] 2 two
[2] 3 three
[3] 4 four
"""
if not isinstance(hive_query, basestring):
raise ValueError("hive query parameter must be a string, but is {0}.".format(type(hive_query)))
TkContext.validate(tc)
scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importHive(tc.jutils.get_scala_sc(), hive_query)
from sparktk.frame.frame import Frame
return Frame(tc, scala_frame)
def import_jdbc(
connection_url, table_name, tc=<class 'sparktk.arguments.implicit'>)
Import data from jdbc table into frame.
connection_url | (str): | JDBC connection url to database server |
table_name | (str): | JDBC table name |
Returns | (Frame): | returns frame with jdbc table data |
Load a frame from a jdbc table specifying the connection url to the database server.
>>> url = "jdbc:postgresql://localhost/postgres"
>>> tb_name = "demo_test"
>>> frame = tc.frame.import_jdbc(url, tb_name)
-etc-
>>> frame.inspect()
[#] a b c d
==================
[0] 1 0.2 -2 5
[1] 2 0.4 -1 6
[2] 3 0.6 0 7
[3] 4 0.8 1 8
>>> frame.schema
[(u'a', int), (u'b', float), (u'c', int), (u'd', int)]
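The connection url follows standard JDBC conventions, so driver options such as credentials can be embedded in it. A sketch for PostgreSQL, where the host, database, user, and password are placeholders:
>>> url = "jdbc:postgresql://localhost/postgres?user=test_user&password=test_pass"
>>> frame = tc.frame.import_jdbc(url, "demo_test")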
def import_jdbc(connection_url, table_name, tc=TkContext.implicit):
"""
Import data from jdbc table into frame.
Parameters
----------
:param connection_url: (str) JDBC connection url to database server
:param table_name: (str) JDBC table name
:return: (Frame) returns frame with jdbc table data
Examples
--------
Load a frame from a jdbc table specifying the connection url to the database server.
>>> url = "jdbc:postgresql://localhost/postgres"
>>> tb_name = "demo_test"
>>> frame = tc.frame.import_jdbc(url, tb_name)
-etc-
>>> frame.inspect()
[#] a b c d
==================
[0] 1 0.2 -2 5
[1] 2 0.4 -1 6
[2] 3 0.6 0 7
[3] 4 0.8 1 8
>>> frame.schema
[(u'a', int), (u'b', float), (u'c', int), (u'd', int)]
"""
if not isinstance(connection_url, basestring):
raise ValueError("connection url parameter must be a string, but is {0}.".format(type(connection_url)))
if not isinstance(table_name, basestring):
raise ValueError("table name parameter must be a string, but is {0}.".format(type(table_name)))
TkContext.validate(tc)
scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importJdbc(tc.jutils.get_scala_sc(), connection_url, table_name)
from sparktk.frame.frame import Frame
return Frame(tc, scala_frame)
def import_pandas(
pandas_frame, schema=None, row_index=True, validate_schema=False, tc=<class 'sparktk.arguments.implicit'>)
Imports data from the specified pandas data frame.
pandas_frame | (pandas.DataFrame): | pandas dataframe object |
schema | (Optional(list[tuples(string, type)])): | Schema description of the fields for a given line. It is a list of tuples which describe each field, (field name, field type), where the field name is a string and the field type is a supported type. If no schema is provided, the schema will be inferred based on the column names and types from the pandas_frame. |
row_index | (Optional(bool)): | Indicates if the row_index is present in the pandas dataframe and needs to be ignored when looking at the data values. Default value is True. |
validate_schema | (Optional(bool)): | If true, validates the data against the schema and attempts to cast the data to the specified type, if it does not match the schema. Defaults to False. |
Returns | (Frame): | spark-tk frame that contains data from the pandas_frame |
Create a pandas data frame:
>>> import pandas
>>> ratings_data = [[0, "invalid"], [1, "Very Poor"], [2, "Poor"], [3, "Average"], [4, "Good"], [5, "Very Good"]]
>>> df = pandas.DataFrame(ratings_data, columns=['rating_id', 'rating_text'])
>>> df
rating_id rating_text
0 0 invalid
1 1 Very Poor
2 2 Poor
3 3 Average
4 4 Good
5 5 Very Good
>>> df.columns.tolist()
['rating_id', 'rating_text']
>>> df.dtypes
rating_id int64
rating_text object
dtype: object
When using import_pandas by just passing the pandas data frame, it will use the column names and types from the pandas data frame to generate the schema.
>>> frame = tc.frame.import_pandas(df)
>>> frame.inspect()
[#] rating_id rating_text
===========================
[0] 0 invalid
[1] 1 Very Poor
[2] 2 Poor
[3] 3 Average
[4] 4 Good
[5] 5 Very Good
>>> frame.schema
[('rating_id', <type 'long'>), ('rating_text', <type 'str'>)]
Alternatively, you can specify a schema when importing the pandas data frame. There is also the option to validate the data against the schema. If this option is enabled, we will attempt to cast the data to the column's data type, if it does not match the schema.
For example, here we will specify a schema where the rating_id column will instead be called 'rating_float' and its data type will be a float. We will also enable the validate_schema option so that the rating_id value will be cast to a float:
>>> schema = [("rating_float", float), ("rating_str", unicode)]
>>> frame = tc.frame.import_pandas(df, schema, validate_schema=True)
>>> frame.inspect()
[#] rating_float rating_str
=============================
[0] 0.0 invalid
[1] 1.0 Very Poor
[2] 2.0 Poor
[3] 3.0 Average
[4] 4.0 Good
[5] 5.0 Very Good
>>> frame.schema
[('rating_float', <type 'float'>), ('rating_str', <type 'unicode'>)]
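Pandas datetime64 columns are also handled: on import they are converted to milliseconds since epoch (see pandas_datetime_to_ms in the source below). A minimal sketch, assuming the sparktk datetime type is used in the schema:
>>> import pandas
>>> from sparktk import dtypes
>>> df = pandas.DataFrame({"when": pandas.to_datetime(["2016-01-01", "2016-06-15"])})
>>> frame = tc.frame.import_pandas(df, schema=[("when", dtypes.datetime)])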
def import_pandas(pandas_frame, schema=None, row_index=True, validate_schema=False, tc=TkContext.implicit):
"""
Imports data from the specified pandas data frame.
Parameters
----------
:param pandas_frame: (pandas.DataFrame) pandas dataframe object
:param schema: (Optional(list[tuples(string, type)])) Schema description of the fields for a given line. It is a
list of tuples which describe each field, (field name, field type), where the field name is a
string and the field type is a supported type. If no schema is provided, the schema will be inferred based
on the column names and types from the pandas_frame.
:param row_index: (Optional(bool)) Indicates if the row_index is present in the pandas dataframe and needs to be
ignored when looking at the data values. Default value is True.
:param validate_schema: (Optional(bool)) If true, validates the data against the schema and attempts to cast the
data to the specified type, if it does not match the schema. Defaults to False.
:return: (Frame) spark-tk frame that contains data from the pandas_frame
Examples
--------
Create a pandas data frame:
>>> import pandas
>>> ratings_data = [[0, "invalid"], [1, "Very Poor"], [2, "Poor"], [3, "Average"], [4, "Good"], [5, "Very Good"]]
>>> df = pandas.DataFrame(ratings_data, columns=['rating_id', 'rating_text'])
>>> df
rating_id rating_text
0 0 invalid
1 1 Very Poor
2 2 Poor
3 3 Average
4 4 Good
5 5 Very Good
>>> df.columns.tolist()
['rating_id', 'rating_text']
>>> df.dtypes
rating_id int64
rating_text object
dtype: object
When using import_pandas by just passing the pandas data frame, it will use the column names and types from the
pandas data frame to generate the schema.
>>> frame = tc.frame.import_pandas(df)
>>> frame.inspect()
[#] rating_id rating_text
===========================
[0] 0 invalid
[1] 1 Very Poor
[2] 2 Poor
[3] 3 Average
[4] 4 Good
[5] 5 Very Good
>>> frame.schema
[('rating_id', <type 'long'>), ('rating_text', <type 'str'>)]
Alternatively, you can specify a schema when importing the pandas data frame. There is also the option to validate
the data against the schema. If this option is enabled, we will attempt to cast the data to the column's data type,
if it does not match the schema.
For example, here we will specify a schema where the rating_id column will instead be called 'rating_float' and its
data type will be a float. We will also enable the validate_schema option so that the rating_id value will be
cast to a float:
>>> schema = [("rating_float", float), ("rating_str", unicode)]
>>> frame = tc.frame.import_pandas(df, schema, validate_schema=True)
>>> frame.inspect()
[#] rating_float rating_str
=============================
[0] 0.0 invalid
[1] 1.0 Very Poor
[2] 2.0 Poor
[3] 3.0 Average
[4] 4.0 Good
[5] 5.0 Very Good
>>> frame.schema
[('rating_float', <type 'float'>), ('rating_str', <type 'unicode'>)]
"""
try:
import pandas
except:
raise RuntimeError("pandas module not found, unable to download. Install pandas or try the take command.")
if not isinstance(pandas_frame, pandas.DataFrame):
raise TypeError("data_frame must be a pandas DataFrame.")
TkContext.validate(tc)
if schema is not None:
schema = _validate(schema)
else:
schema = _get_schema_from_df(pandas_frame)
if not row_index:
pandas_frame = pandas_frame.reset_index()
pandas_frame = pandas_frame.dropna(thresh=len(pandas_frame.columns))
field_names = [x[0] for x in schema]
if len(pandas_frame.columns) != len(field_names):
raise ValueError("Number of columns in Pandasframe {0} does not match the number of columns in the"
" schema provided {1}.".format(len(pandas_frame.columns), len(field_names)))
date_time_columns = [i for i, x in enumerate(pandas_frame.dtypes) if x == "datetime64[ns]"]
has_date_time = len(date_time_columns) > 0
# pandas gives us the date/time in ns (nanoseconds since epoch) or as a Timestamp, and spark-tk expects it as ms, so we need to do the conversion
def pandas_datetime_to_ms(row):
for i in date_time_columns:
if isinstance(row[i], long):
row[i] = row[i] / 1000000
elif isinstance(row[i], pandas.tslib.Timestamp) or isinstance(row[i], datetime):
dt = row[i]
# get the number of seconds since epoch (%s), multiply by 1000 for ms, then add the
# microseconds (integer-divided by 1000) to get ms precision.
row[i] = long((long(dt.strftime("%s")) * 1000) + (dt.microsecond // 1000))
return row
pandas_rows = pandas_frame[0:len(pandas_frame.index)].values.tolist()
# if the dataframe has date/time columns, map them to ms
if (has_date_time):
pandas_rows = map(pandas_datetime_to_ms, pandas_rows)
# create frame with the pandas_rows
frame = tc.frame.create(pandas_rows, schema)
if validate_schema:
frame = tc.frame.create(frame.rdd, schema, validate_schema)
return frame
def load(
path, tc=<class 'sparktk.arguments.implicit'>)
load Frame from given path
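A frame that was previously saved with frame.save can be restored with load (equivalently, tc.load). A minimal sketch, using a hypothetical path:
>>> frame.save("sandbox/my_frame")
>>> restored = tc.load("sandbox/my_frame")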
def load(path, tc=TkContext.implicit):
"""load Frame from given path"""
TkContext.validate(tc)
return tc.load(path, Frame)
Classes
class Frame
class Frame(object):
def __init__(self, tc, source, schema=None, validate_schema=False):
"""(Private constructor -- use tc.frame.create or other methods available from the TkContext)"""
self._tc = tc
if self._is_scala_frame(source):
self._frame = source
elif self._is_scala_rdd(source):
scala_schema = schema_to_scala(tc.sc, schema)
self._frame = self._create_scala_frame(tc.sc, source, scala_schema)
elif self._is_scala_dataframe(source):
self._frame = self._create_scala_frame_from_scala_dataframe(tc.sc, source)
elif isinstance(source, DataFrame):
self._frame = self._create_scala_frame_from_scala_dataframe(tc.sc, source._jdf)
elif isinstance(source, PythonFrame):
self._frame = source
else:
if not isinstance(source, RDD):
if not isinstance(source, list) or (len(source) > 0 and any(not isinstance(row, (list, tuple)) for row in source)):
raise TypeError("Invalid data source. The data parameter must be a 2-dimensional list (list of row data) or an RDD.")
inferred_schema = False
if isinstance(schema, list):
if all(isinstance(item, basestring) for item in schema):
# check if schema is just a list of column names (versus string and data type tuples)
schema = self._infer_schema(source, schema)
inferred_schema = True
elif not all(isinstance(item, tuple) and
len(item) == 2 and
isinstance(item[0], basestring) for item in schema):
raise TypeError("Invalid schema. Expected a list of tuples (str, type) with the column name and data type, but received type %s." % type(schema))
# check for duplicate column names
column_names = [col[0] for col in schema]
duplicate_column_names = set([col for col in column_names if column_names.count(col) > 1])
if len(duplicate_column_names) > 0:
raise ValueError("Invalid schema, column names cannot be duplicated: %s" % ", ".join(duplicate_column_names))
elif schema is None:
schema = self._infer_schema(source)
inferred_schema = True
else:
# Schema is not a list or None
raise TypeError("Invalid schema type: %s. Expected a list of tuples (str, type) with the column name and data type." % type(schema))
for item in schema:
if not self._is_supported_datatype(item[1]):
if inferred_schema:
raise TypeError("The %s data type was found when inferring the schema, and it is not a "
"supported data type. Instead, specify a schema that uses a supported data "
"type, and enable validate_schema so that the data is converted to the proper "
"data type.\n\nInferred schema: %s\n\nSupported data types: %s" %
(str(item[1]), str(schema), dtypes.dtypes))
else:
raise TypeError("Invalid schema. %s is not a supported data type.\n\nSupported data types: %s" %
(str(item[1]), dtypes.dtypes))
source = tc.sc.parallelize(source)
if schema and validate_schema:
# Validate schema by going through the data and checking the data type and attempting to parse it
validate_schema_result = self.validate_pyrdd_schema(source, schema)
source = validate_schema_result.validated_rdd
logger.debug("%s values were unable to be parsed to the schema's data type." % validate_schema_result.bad_value_count)
# If schema contains matrix datatype, then apply type_coercer to convert list[list] to numpy ndarray
map_source = MatrixCoercion.schema_is_coercible(source, list(schema))
self._frame = PythonFrame(map_source, schema)
def _merge_types(self, type_list_a, type_list_b):
"""
Merges two lists of data types
:param type_list_a: First list of data types to merge
:param type_list_b: Second list of data types to merge
:return: List of merged data types
"""
if not isinstance(type_list_a, list) or not isinstance(type_list_b, list):
raise TypeError("Unable to generate schema, because schema is not a list.")
if len(type_list_a) != len(type_list_b):
raise ValueError("Length of each row must be the same (found rows with lengths: %s and %s)." % (len(type_list_a), len(type_list_b)))
return [dtypes._DataTypes.merge_types(type_list_a[i], type_list_b[i]) for i in xrange(0, len(type_list_a))]
def _infer_types_for_row(self, row):
"""
Returns a list of data types for the data in the specified row
:param row: List or Row of data
:return: List of data types
"""
inferred_types = []
for item in row:
if item is None:
inferred_types.append(int)
elif not isinstance(item, list):
inferred_types.append(type(item))
else:
inferred_types.append(dtypes.vector((len(item))))
return inferred_types
def _infer_schema(self, data, column_names=[], sample_size=100):
"""
Infers the schema based on the data in the RDD.
:param sc: Spark Context
:param data: Data used to infer schema
:param column_names: Optional column names to use in the schema. If no column names are provided, columns
are given numbered names. If there are more columns in the RDD than there are in the
column_names list, remaining columns will be numbered.
:param sample_size: Number of rows to check when inferring the schema. Defaults to 100.
:return: Schema
"""
inferred_schema = []
if isinstance(data, list):
if len(data) > 0:
# get the schema for the first row
data_types = self._infer_types_for_row(data[0])
sample_size = min(sample_size, len(data))
for i in xrange (1, sample_size):
data_types = self._merge_types(data_types, self._infer_types_for_row(data[i]))
for i, data_type in enumerate(data_types):
column_name = "C%s" % i
if len(column_names) > i:
column_name = column_names[i]
inferred_schema.append((column_name, data_type))
else:
raise TypeError("Unable to infer schema, because the data provided is not a list.")
return inferred_schema
def _is_supported_datatype(self, data_type):
"""
Returns True if the specified data_type is supported.
"""
supported_primitives = [int, float, long, str, unicode]
if data_type in supported_primitives:
return True
elif data_type is dtypes.datetime:
return True
elif type(data_type) is dtypes.vector:
return True
elif data_type is dtypes.matrix:
return True
else:
return False
def validate_pyrdd_schema(self, pyrdd, schema):
if isinstance(pyrdd, RDD):
schema_length = len(schema)
num_bad_values = self._tc.sc.accumulator(0)
def validate_schema(row, accumulator):
data = []
if len(row) != schema_length:
raise ValueError("Length of the row (%s) does not match the schema length (%s)." % (len(row), len(schema)))
for index, column in enumerate(schema):
data_type = column[1]
try:
if row[index] is not None:
data.append(dtypes.dtypes.cast(row[index], data_type))
except:
data.append(None)
accumulator += 1
return data
validated_rdd = pyrdd.map(lambda row: validate_schema(row, num_bad_values))
# Force rdd to load, so that we can get a bad value count
validated_rdd.count()
return SchemaValidationReturn(validated_rdd, num_bad_values.value)
else:
raise TypeError("Unable to validate schema, because the pyrdd provided is not an RDD.")
@staticmethod
def _create_scala_frame(sc, scala_rdd, scala_schema):
"""call constructor in JVM"""
return sc._jvm.org.trustedanalytics.sparktk.frame.Frame(scala_rdd, scala_schema, False)
@staticmethod
def _create_scala_frame_from_scala_dataframe(sc, scala_dataframe):
"""call constructor in JVM"""
return sc._jvm.org.trustedanalytics.sparktk.frame.Frame(scala_dataframe)
@staticmethod
def _from_scala(tc, scala_frame):
"""creates a python Frame for the given scala Frame"""
return Frame(tc, scala_frame)
def _frame_to_scala(self, python_frame):
"""converts a PythonFrame to a Scala Frame"""
scala_schema = schema_to_scala(self._tc.sc, python_frame.schema)
scala_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.rdd.PythonJavaRdd.pythonToScala(python_frame.rdd._jrdd, scala_schema)
return self._create_scala_frame(self._tc.sc, scala_rdd, scala_schema)
def _is_scala_frame(self, item):
return self._tc._jutils.is_jvm_instance_of(item, self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.Frame)
def _is_scala_rdd(self, item):
return self._tc._jutils.is_jvm_instance_of(item, self._tc.sc._jvm.org.apache.spark.rdd.RDD)
def _is_scala_dataframe(self, item):
return self._tc._jutils.is_jvm_instance_of(item, self._tc.sc._jvm.org.apache.spark.sql.DataFrame)
def _is_python_rdd(self, item):
return isinstance(item, RDD)
@property
def _is_scala(self):
"""answers whether the current frame is backed by a Scala Frame"""
return self._is_scala_frame(self._frame)
@property
def _is_python(self):
"""answers whether the current frame is backed by a _PythonFrame"""
return not self._is_scala
@property
def _scala(self):
"""gets frame backend as Scala Frame, causes conversion if it is current not"""
if self._is_python:
# If schema contains matrix dataype,
# then apply type_coercer_pymlib to convert ndarray to pymlib DenseMatrix for serialization purpose at java
self._frame.rdd = MatrixCoercion.schema_is_coercible(self._frame.rdd, list(self._frame.schema), True)
# convert PythonFrame to a Scala Frame
scala_schema = schema_to_scala(self._tc.sc, self._frame.schema)
scala_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.pythonToScala(self._frame.rdd._jrdd, scala_schema)
self._frame = self._create_scala_frame(self._tc.sc, scala_rdd, scala_schema)
return self._frame
@property
def _python(self):
"""gets frame backend as _PythonFrame, causes conversion if it is current not"""
if self._is_scala:
# convert Scala Frame to a PythonFrame
scala_schema = self._frame.schema()
java_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(self._frame.rdd())
python_schema = schema_to_python(self._tc.sc, scala_schema)
python_rdd = RDD(java_rdd, self._tc.sc)
# If schema contains matrix datatype, then apply type_coercer to convert list[list] to numpy ndarray
map_python_rdd = MatrixCoercion.schema_is_coercible(python_rdd, list(python_schema))
self._frame = PythonFrame(map_python_rdd, python_schema)
return self._frame
##########################################################################
# API
##########################################################################
@property
def rdd(self):
"""pyspark RDD (causes conversion if currently backed by a Scala RDD)"""
return self._python.rdd
@property
def dataframe(self):
"""pyspark DataFrame (causes conversion through Scala)"""
return DataFrame(self._scala.dataframe(), self._tc.sql_context)
@property
def schema(self):
if self._is_scala:
return schema_to_python(self._tc.sc, self._frame.schema()) # need ()'s on schema because it's a def in scala
return self._frame.schema
@property
def column_names(self):
"""
Column identifications in the current frame.
:return: list of names of all the frame's columns
Returns the names of the columns of the current frame.
Examples
--------
>>> frame.column_names
[u'name', u'age', u'tenure', u'phone']
"""
return [name for name, data_type in self.schema]
# Frame Operations
from sparktk.frame.ops.add_columns import add_columns
from sparktk.frame.ops.append import append
from sparktk.frame.ops.assign_sample import assign_sample
from sparktk.frame.ops.bin_column import bin_column
from sparktk.frame.ops.binary_classification_metrics import binary_classification_metrics
from sparktk.frame.ops.box_cox import box_cox
from sparktk.frame.ops.categorical_summary import categorical_summary
from sparktk.frame.ops.collect import collect
from sparktk.frame.ops.column_median import column_median
from sparktk.frame.ops.column_mode import column_mode
from sparktk.frame.ops.column_summary_statistics import column_summary_statistics
from sparktk.frame.ops.copy import copy
from sparktk.frame.ops.correlation import correlation
from sparktk.frame.ops.correlation_matrix import correlation_matrix
from sparktk.frame.ops.count import count
from sparktk.frame.ops.covariance import covariance
from sparktk.frame.ops.covariance_matrix import covariance_matrix
from sparktk.frame.ops.cumulative_percent import cumulative_percent
from sparktk.frame.ops.cumulative_sum import cumulative_sum
from sparktk.frame.ops.dot_product import dot_product
from sparktk.frame.ops.drop_columns import drop_columns
from sparktk.frame.ops.drop_duplicates import drop_duplicates
from sparktk.frame.ops.drop_rows import drop_rows
from sparktk.frame.ops.ecdf import ecdf
from sparktk.frame.ops.entropy import entropy
from sparktk.frame.ops.export_data import export_to_csv, export_to_jdbc, export_to_json, export_to_hbase, export_to_hive
from sparktk.frame.ops.filter import filter
from sparktk.frame.ops.flatten_columns import flatten_columns
from sparktk.frame.ops.group_by import group_by
from sparktk.frame.ops.histogram import histogram
from sparktk.frame.ops.inspect import inspect
from sparktk.frame.ops.join_inner import join_inner
from sparktk.frame.ops.join_left import join_left
from sparktk.frame.ops.join_right import join_right
from sparktk.frame.ops.join_outer import join_outer
from sparktk.frame.ops.map_columns import map_columns
from sparktk.frame.ops.matrix_covariance_matrix import matrix_covariance_matrix
from sparktk.frame.ops.matrix_pca import matrix_pca
from sparktk.frame.ops.matrix_svd import matrix_svd
from sparktk.frame.ops.multiclass_classification_metrics import multiclass_classification_metrics
from sparktk.frame.ops.power_iteration_clustering import power_iteration_clustering
from sparktk.frame.ops.quantile_bin_column import quantile_bin_column
from sparktk.frame.ops.quantiles import quantiles
from sparktk.frame.ops.rename_columns import rename_columns
from sparktk.frame.ops.reverse_box_cox import reverse_box_cox
from sparktk.frame.ops.save import save
from sparktk.frame.ops.sort import sort
from sparktk.frame.ops.sortedk import sorted_k
from sparktk.frame.ops.take import take
from sparktk.frame.ops.tally import tally
from sparktk.frame.ops.tally_percent import tally_percent
from sparktk.frame.ops.timeseries_augmented_dickey_fuller_test import timeseries_augmented_dickey_fuller_test
from sparktk.frame.ops.timeseries_breusch_godfrey_test import timeseries_breusch_godfrey_test
from sparktk.frame.ops.timeseries_breusch_pagan_test import timeseries_breusch_pagan_test
from sparktk.frame.ops.timeseries_durbin_watson_test import timeseries_durbin_watson_test
from sparktk.frame.ops.timeseries_from_observations import timeseries_from_observations
from sparktk.frame.ops.timeseries_slice import timeseries_slice
from sparktk.frame.ops.to_pandas import to_pandas
from sparktk.frame.ops.topk import top_k
from sparktk.frame.ops.unflatten_columns import unflatten_columns
Ancestors (in MRO)
- Frame
- __builtin__.object
Instance variables
var column_names
Column identifications in the current frame.
:return: list of names of all the frame's columns
Returns the names of the columns of the current frame.
>>> frame.column_names
[u'name', u'age', u'tenure', u'phone']
var dataframe
pyspark DataFrame (causes conversion through Scala)
var rdd
pyspark RDD (causes conversion if currently backed by a Scala RDD)
var schema
Methods
def __init__(
self, tc, source, schema=None, validate_schema=False)
(Private constructor -- use tc.frame.create or other methods available from the TkContext)
def __init__(self, tc, source, schema=None, validate_schema=False):
"""(Private constructor -- use tc.frame.create or other methods available from the TkContext)"""
self._tc = tc
if self._is_scala_frame(source):
self._frame = source
elif self._is_scala_rdd(source):
scala_schema = schema_to_scala(tc.sc, schema)
self._frame = self._create_scala_frame(tc.sc, source, scala_schema)
elif self._is_scala_dataframe(source):
self._frame = self._create_scala_frame_from_scala_dataframe(tc.sc, source)
elif isinstance(source, DataFrame):
self._frame = self._create_scala_frame_from_scala_dataframe(tc.sc, source._jdf)
elif isinstance(source, PythonFrame):
self._frame = source
else:
if not isinstance(source, RDD):
if not isinstance(source, list) or (len(source) > 0 and any(not isinstance(row, (list, tuple)) for row in source)):
raise TypeError("Invalid data source. The data parameter must be a 2-dimensional list (list of row data) or an RDD.")
inferred_schema = False
if isinstance(schema, list):
if all(isinstance(item, basestring) for item in schema):
# check if schema is just a list of column names (versus string and data type tuples)
schema = self._infer_schema(source, schema)
inferred_schema = True
elif not all(isinstance(item, tuple) and
len(item) == 2 and
isinstance(item[0], basestring) for item in schema):
raise TypeError("Invalid schema. Expected a list of tuples (str, type) with the column name and data type, but received type %s." % type(schema))
# check for duplicate column names
column_names = [col[0] for col in schema]
duplicate_column_names = set([col for col in column_names if column_names.count(col) > 1])
if len(duplicate_column_names) > 0:
raise ValueError("Invalid schema, column names cannot be duplicated: %s" % ", ".join(duplicate_column_names))
elif schema is None:
schema = self._infer_schema(source)
inferred_schema = True
else:
# Schema is not a list or None
raise TypeError("Invalid schema type: %s. Expected a list of tuples (str, type) with the column name and data type." % type(schema))
for item in schema:
if not self._is_supported_datatype(item[1]):
if inferred_schema:
raise TypeError("The %s data type was found when inferring the schema, and it is not a "
"supported data type. Instead, specify a schema that uses a supported data "
"type, and enable validate_schema so that the data is converted to the proper "
"data type.\n\nInferred schema: %s\n\nSupported data types: %s" %
(str(item[1]), str(schema), dtypes.dtypes))
else:
raise TypeError("Invalid schema. %s is not a supported data type.\n\nSupported data types: %s" %
(str(item[1]), dtypes.dtypes))
source = tc.sc.parallelize(source)
if schema and validate_schema:
# Validate schema by going through the data and checking the data type and attempting to parse it
validate_schema_result = self.validate_pyrdd_schema(source, schema)
source = validate_schema_result.validated_rdd
logger.debug("%s values were unable to be parsed to the schema's data type." % validate_schema_result.bad_value_count)
# If schema contains matrix datatype, then apply type_coercer to convert list[list] to numpy ndarray
map_source = MatrixCoercion.schema_is_coercible(source, list(schema))
self._frame = PythonFrame(map_source, schema)
def add_columns(
self, func, schema)
Add columns to current frame.
Assigns data to column based on evaluating a function for each row.
- The row |UDF| ('func') must return a value in the same format as specified by the schema.
:param func: (UDF) Function which takes the values in the row and produces a value, or collection of values, for the new cell(s).
:param schema: (List[(str,type)]) Schema for the column(s) being added.
Given our frame, let's add a column which has how many years the person has been over 18
>>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
... ['Susan',33,3,'555-0202'],
... ['Thurston',65,26,'555-4510'],
... ['Judy',44,14,'555-2183']],
... schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])
>>> frame.inspect()
[#] name age tenure phone
====================================
[0] Fred 39 16 555-1234
[1] Susan 33 3 555-0202
[2] Thurston 65 26 555-4510
[3] Judy 44 14 555-2183
>>> frame.add_columns(lambda row: row.age - 18, ('adult_years', int))
>>> frame.inspect()
[#] name age tenure phone adult_years
=================================================
[0] Fred 39 16 555-1234 21
[1] Susan 33 3 555-0202 15
[2] Thurston 65 26 555-4510 47
[3] Judy 44 14 555-2183 26
Multiple columns can be added at the same time. Let's add percentage of life and percentage of adult life in one call, which is more efficient.
>>> frame.add_columns(lambda row: [row.tenure / float(row.age), row.tenure / float(row.adult_years)],
... [("of_age", float), ("of_adult", float)])
>>> frame.inspect(round=2)
[#] name age tenure phone adult_years of_age of_adult
===================================================================
[0] Fred 39 16 555-1234 21 0.41 0.76
[1] Susan 33 3 555-0202 15 0.09 0.20
[2] Thurston 65 26 555-4510 47 0.40 0.55
[3] Judy 44 14 555-2183 26 0.32 0.54
Note that the function returns a list, and therefore the schema also needs to be a list.
It is not necessary to use lambda syntax, any function will do, as long as it takes a single row argument. We can also call other local functions within.
Let's add a column which shows a portion of the person's name based on their adult tenure percentage.
>>> def percentage_of_string(string, percentage):
... '''returns a substring of the given string according to the given percentage'''
... substring_len = int(percentage * len(string))
... return string[:substring_len]
>>> def add_name_by_adult_tenure(row):
... return percentage_of_string(row.name, row.of_adult)
>>> frame.add_columns(add_name_by_adult_tenure, ('tenured_name', unicode))
>>> frame.inspect(columns=['name', 'of_adult', 'tenured_name'], round=2)
[#] name of_adult tenured_name
=====================================
[0] Fred 0.76 Fre
[1] Susan 0.20 S
[2] Thurston 0.55 Thur
[3] Judy 0.54 Ju
Let's add a name based on tenure percentage of age.
>>> frame.add_columns(lambda row: percentage_of_string(row.name, row.of_age),
... ('tenured_name_age', unicode))
>>> frame.inspect(round=2)
[#] name age tenure phone adult_years of_age of_adult
===================================================================
[0] Fred 39 16 555-1234 21 0.41 0.76
[1] Susan 33 3 555-0202 15 0.09 0.20
[2] Thurston 65 26 555-4510 47 0.40 0.55
[3] Judy 44 14 555-2183 26 0.32 0.54
<BLANKLINE>
[#] tenured_name tenured_name_age
===================================
[0] Fre F
[1] S
[2] Thur Thu
[3] Ju J
def add_columns(self, func, schema):
"""
Add columns to current frame.
Assigns data to column based on evaluating a function for each row.
Notes
-----
1. The row |UDF| ('func') must return a value in the same format as
specified by the schema.
Parameters
----------
:param func: (UDF) Function which takes the values in the row and produces a value, or collection of values, for the new cell(s).
:param schema: (List[(str,type)]) Schema for the column(s) being added.
Examples
--------
Given our frame, let's add a column which has how many years the person has been over 18
>>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
... ['Susan',33,3,'555-0202'],
... ['Thurston',65,26,'555-4510'],
... ['Judy',44,14,'555-2183']],
... schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])
>>> frame.inspect()
[#] name age tenure phone
====================================
[0] Fred 39 16 555-1234
[1] Susan 33 3 555-0202
[2] Thurston 65 26 555-4510
[3] Judy 44 14 555-2183
>>> frame.add_columns(lambda row: row.age - 18, ('adult_years', int))
>>> frame.inspect()
[#] name age tenure phone adult_years
=================================================
[0] Fred 39 16 555-1234 21
[1] Susan 33 3 555-0202 15
[2] Thurston 65 26 555-4510 47
[3] Judy 44 14 555-2183 26
Multiple columns can be added at the same time. Let's add percentage of
life and percentage of adult life in one call, which is more efficient.
>>> frame.add_columns(lambda row: [row.tenure / float(row.age), row.tenure / float(row.adult_years)],
... [("of_age", float), ("of_adult", float)])
>>> frame.inspect(round=2)
[#] name age tenure phone adult_years of_age of_adult
===================================================================
[0] Fred 39 16 555-1234 21 0.41 0.76
[1] Susan 33 3 555-0202 15 0.09 0.20
[2] Thurston 65 26 555-4510 47 0.40 0.55
[3] Judy 44 14 555-2183 26 0.32 0.54
Note that the function returns a list, and therefore the schema also needs to be a list.
It is not necessary to use lambda syntax, any function will do, as long as it takes a single row argument. We
can also call other local functions within.
Let's add a column which shows a portion of the person's name based on their adult tenure percentage.
>>> def percentage_of_string(string, percentage):
... '''returns a substring of the given string according to the given percentage'''
... substring_len = int(percentage * len(string))
... return string[:substring_len]
>>> def add_name_by_adult_tenure(row):
... return percentage_of_string(row.name, row.of_adult)
>>> frame.add_columns(add_name_by_adult_tenure, ('tenured_name', unicode))
>>> frame.inspect(columns=['name', 'of_adult', 'tenured_name'], round=2)
[#] name of_adult tenured_name
=====================================
[0] Fred 0.76 Fre
[1] Susan 0.20 S
[2] Thurston 0.55 Thur
[3] Judy 0.54 Ju
Let's add a name based on tenure percentage of age.
>>> frame.add_columns(lambda row: percentage_of_string(row.name, row.of_age),
... ('tenured_name_age', unicode))
>>> frame.inspect(round=2)
[#] name age tenure phone adult_years of_age of_adult
===================================================================
[0] Fred 39 16 555-1234 21 0.41 0.76
[1] Susan 33 3 555-0202 15 0.09 0.20
[2] Thurston 65 26 555-4510 47 0.40 0.55
[3] Judy 44 14 555-2183 26 0.32 0.54
<BLANKLINE>
[#] tenured_name tenured_name_age
===================================
[0] Fre F
[1] S
[2] Thur Thu
[3] Ju J
"""
schema_helper.validate(schema)
schema_helper.validate_is_mergeable(self._tc, self.schema, schema)
row = Row(self.schema)
def add_columns_func(r):
row._set_data(r)
return func(row)
if isinstance(schema, list):
self._python.rdd = self._python.rdd.map(lambda r: r + add_columns_func(r))
self._python.schema.extend(schema)
else:
self._python.rdd = self._python.rdd.map(lambda r: r + [add_columns_func(r)])
self._python.schema.append(schema)
def append(
self, frame)
Adds more data to the current frame.
:param frame: (Frame) Frame of data to append to the current frame.
In this example, we start off by creating a frame of animals.
>>> animals = tc.frame.create([['dog', 'snoopy'],['cat', 'tom'],['bear', 'yogi'],['mouse', 'jerry']],
... [('animal', str), ('name', str)])
[===Job Progress===]
>>> animals.inspect()
[#] animal name
===================
[0] dog snoopy
[1] cat tom
[2] bear yogi
[3] mouse jerry
Then, we append a frame that will add a few more animals to the original frame.
>>> animals.append(tc.frame.create([['donkey'],['elephant'], ['ostrich']], [('animal', str)]))
[===Job Progress===]
>>> animals.inspect()
[#] animal name
=====================
[0] dog snoopy
[1] cat tom
[2] bear yogi
[3] mouse jerry
[4] donkey None
[5] elephant None
[6] ostrich None
The data we added didn't have names, so None values were inserted for the new rows.
def append(self, frame):
"""
Adds more data to the current frame.
Parameters
----------
:param frame: (Frame) Frame of data to append to the current frame.
Examples
--------
In this example, we start off by creating a frame of animals.
>>> animals = tc.frame.create([['dog', 'snoopy'],['cat', 'tom'],['bear', 'yogi'],['mouse', 'jerry']],
... [('animal', str), ('name', str)])
[===Job Progress===]
>>> animals.inspect()
[#] animal name
===================
[0] dog snoopy
[1] cat tom
[2] bear yogi
[3] mouse jerry
Then, we append a frame that will add a few more animals to the original frame.
>>> animals.append(tc.frame.create([['donkey'],['elephant'], ['ostrich']], [('animal', str)]))
[===Job Progress===]
>>> animals.inspect()
[#] animal name
=====================
[0] dog snoopy
[1] cat tom
[2] bear yogi
[3] mouse jerry
[4] donkey None
[5] elephant None
[6] ostrich None
The data we added didn't have names, so None values were inserted for the new rows.
"""
from sparktk.frame.frame import Frame
if not isinstance(frame, Frame):
raise TypeError("frame must be a Frame type, but is: {0}".format(type(frame)))
self._scala.append(frame._scala)
def assign_sample(
self, sample_percentages, sample_labels=None, output_column=None, seed=None)
Randomly group rows into user-defined classes.
:param sample_percentages: (List[float]) Entries are non-negative and sum to 1. (See the note below.) If the *i*'th entry of the list is *p*, then each row receives label *i* with independent probability *p*.
:param sample_labels: (Optional[List[str]]) Names to be used for the split classes. Defaults to 'TR', 'TE', 'VA' when the length of *sample_percentages* is 3, and defaults to Sample_0, Sample_1, ... otherwise.
:param output_column: (str) Name of the new column which holds the labels generated by the function
:param seed: (int) Random seed used to generate the labels. Defaults to 0.
Randomly assign classes to rows given a vector of percentages.
The table receives an additional column that contains a random label.
The random label is generated by a probability distribution function.
The distribution function is specified by the sample_percentages, a list of
floating point values, which add up to 1.
The labels are non-negative integers drawn from the range :math:`[0, len(S) - 1]`, where :math:`S` is the sample_percentages.
The sample percentages provided by the user are preserved to at least eight decimal places, but beyond this there may be small changes due to floating point imprecision.
In particular:
- The engine validates that the sum of probabilities sums to 1.0 within eight decimal places and returns an error if the sum falls outside of this range.
- The probability of the final class is clamped so that each row receives a valid label with probability one.
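Conceptually, each row's label comes from comparing a uniform random draw against the cumulative percentages; a standalone sketch of that lookup (not the sparktk implementation):
>>> def pick_label(draw, percentages, labels):
...     cumulative = 0.0
...     for label, p in zip(labels, percentages):
...         cumulative += p
...         if draw < cumulative:
...             return label
...     return labels[-1]   # final class is clamped so every row gets a label
>>> pick_label(0.55, [0.4, 0.3, 0.3], ['TR', 'TE', 'VA'])
'TE'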
Consider this simple frame.
>>> frame.inspect()
[#] blip id
=============
[0] abc 0
[1] def 1
[2] ghi 2
[3] jkl 3
[4] mno 4
[5] pqr 5
[6] stu 6
[7] vwx 7
[8] yza 8
[9] bcd 9
We'll assign labels to each row according to a rough 40-30-30 split, for "train", "test", and "validate".
>>> frame.assign_sample([0.4, 0.3, 0.3])
[===Job Progress===]
>>> frame.inspect()
[#] blip id sample_bin
=========================
[0] abc 0 VA
[1] def 1 TR
[2] ghi 2 TE
[3] jkl 3 TE
[4] mno 4 TE
[5] pqr 5 TR
[6] stu 6 TR
[7] vwx 7 VA
[8] yza 8 VA
[9] bcd 9 VA
Now the frame has a new column named "sample_bin" with a string label. Values in the other columns are unaffected.
Here it is again, this time specifying labels, output column and random seed
>>> frame.assign_sample([0.2, 0.2, 0.3, 0.3],
... ["cat1", "cat2", "cat3", "cat4"],
... output_column="cat",
... seed=12)
[===Job Progress===]
>>> frame.inspect()
[#] blip id sample_bin cat
===============================
[0] abc 0 VA cat4
[1] def 1 TR cat2
[2] ghi 2 TE cat3
[3] jkl 3 TE cat4
[4] mno 4 TE cat1
[5] pqr 5 TR cat3
[6] stu 6 TR cat2
[7] vwx 7 VA cat3
[8] yza 8 VA cat3
[9] bcd 9 VA cat4
def assign_sample(self, sample_percentages,
sample_labels = None,
output_column = None,
seed = None):
"""
Randomly group rows into user-defined classes.
Parameters
----------
:param sample_percentages: (List[float]) Entries are non-negative and sum to 1. (See the note below.)
If the *i*'th entry of the list is *p*, then each row
receives label *i* with independent probability *p*.
:param sample_labels: (Optional[List[str]]) Names to be used for the split classes. Defaults to 'TR', 'TE',
'VA' when the length of *sample_percentages* is 3, and defaults
to Sample_0, Sample_1, ... otherwise.
:param output_column: (str) Name of the new column which holds the labels generated by the function
:param seed: (int) Random seed used to generate the labels. Defaults to 0.
Randomly assign classes to rows given a vector of percentages.
The table receives an additional column that contains a random label.
The random label is generated by a probability distribution function.
The distribution function is specified by the sample_percentages, a list of
floating point values, which add up to 1.
The labels are non-negative integers drawn from the range
:math:`[ 0, len(S) - 1]` where :math:`S` is the sample_percentages.
Notes
-----
The sample percentages provided by the user are preserved to at least eight
decimal places, but beyond this there may be small changes due to floating
point imprecision.
In particular:
1. The engine validates that the sum of probabilities sums to 1.0 within
eight decimal places and returns an error if the sum falls outside of this
range.
+ The probability of the final class is clamped so that each row receives a
valid label with probability one.
Examples
--------
Consider this simple frame.
>>> frame.inspect()
[#] blip id
=============
[0] abc 0
[1] def 1
[2] ghi 2
[3] jkl 3
[4] mno 4
[5] pqr 5
[6] stu 6
[7] vwx 7
[8] yza 8
[9] bcd 9
We'll assign labels to each row according to a rough 40-30-30 split, for
"train", "test", and "validate".
>>> frame.assign_sample([0.4, 0.3, 0.3])
[===Job Progress===]
>>> frame.inspect()
[#] blip id sample_bin
=========================
[0] abc 0 VA
[1] def 1 TR
[2] ghi 2 TE
[3] jkl 3 TE
[4] mno 4 TE
[5] pqr 5 TR
[6] stu 6 TR
[7] vwx 7 VA
[8] yza 8 VA
[9] bcd 9 VA
Now the frame has a new column named "sample_bin" with a string label.
Values in the other columns are unaffected.
Here it is again, this time specifying labels, output column and random seed
>>> frame.assign_sample([0.2, 0.2, 0.3, 0.3],
... ["cat1", "cat2", "cat3", "cat4"],
... output_column="cat",
... seed=12)
[===Job Progress===]
>>> frame.inspect()
[#] blip id sample_bin cat
===============================
[0] abc 0 VA cat4
[1] def 1 TR cat2
[2] ghi 2 TE cat3
[3] jkl 3 TE cat4
[4] mno 4 TE cat1
[5] pqr 5 TR cat3
[6] stu 6 TR cat2
[7] vwx 7 VA cat3
[8] yza 8 VA cat3
[9] bcd 9 VA cat4
"""
self._scala.assignSample(self._tc.jutils.convert.to_scala_list_double(sample_percentages),
self._tc.jutils.convert.to_scala_option(sample_labels),
self._tc.jutils.convert.to_scala_option(output_column),
self._tc.jutils.convert.to_scala_option(seed))
def bin_column(
self, column_name, bins=None, include_lowest=True, strict_binning=False, bin_column_name=None)
Summarize rows of data based on the value in a single column by sorting them into bins, or groups, based on a list of bin cutoff points or a specified number of equal-width bins.
:param column_name: (str) Name of the column to bin
:param bins: (Optional[List[float]]) Either a single value representing the number of equal-width bins to create, or an array of values containing bin cutoff points. Array can be list or tuple. If an array is provided, values must be progressively increasing. All bin boundaries must be included, so, with N bins, you need N+1 values. Default (None or Empty List) is equal-width bins where the maximum number of bins is the Square-root choice :math:`\lfloor \sqrt{m} \rfloor`, where :math:`m` is the number of rows.
:param include_lowest: (bool) Specify how the boundary conditions are handled. ``True`` indicates that the lower bound of the bin is inclusive. ``False`` indicates that the upper bound is inclusive. Default is ``True``.
:param strict_binning: (bool) Specify how values outside of the cutoffs array should be binned. If set to ``True``, each value less than cutoffs[0] or greater than cutoffs[-1] will be assigned a bin value of -1. If set to ``False``, values less than cutoffs[0] will be included in the first bin while values greater than cutoffs[-1] will be included in the final bin.
:param bin_column_name: (str) The name for the new binned column. Default is ``<column_name>_binned``
:return: (List[float]) a list containing the edges of each bin
- Bin IDs are 0-indexed; in other words, the lowest bin number is 0.
- The first and last cutoffs are always included in the bins. When include_lowest is True, the last bin includes both cutoffs. When include_lowest is False, the first bin (bin 0) includes both cutoffs.
For these examples, we will use a frame with column a accessed by a Frame object my_frame:
>>> frame.inspect(n=11)
[##] a
========
[0] 1
[1] 1
[2] 2
[3] 3
[4] 5
[5] 8
[6] 13
[7] 21
[8] 34
[9] 55
[10] 89
Modify the frame with a column showing what bin the data is in, by specifying cutoffs for the bin edges. The data values should use strict_binning:
>>> frame.bin_column('a', [5, 12, 25, 60], include_lowest=True,
... strict_binning=True, bin_column_name='binned_using_cutoffs')
[===Job Progress===]
>>> frame.inspect(n=11)
[##] a binned_using_cutoffs
==============================
[0] 1 -1
[1] 1 -1
[2] 2 -1
[3] 3 -1
[4] 5 0
[5] 8 0
[6] 13 1
[7] 21 1
[8] 34 2
[9] 55 2
[10] 89 -1
Modify the frame with a column showing what bin the data is in. The data value should not use strict_binning:
>>> frame.bin_column('a', [5, 12, 25, 60], include_lowest=True,
... strict_binning=False, bin_column_name='binned_using_cutoffs')
[===Job Progress===]
>>> frame.inspect(n=11)
[##] a binned_using_cutoffs
==============================
[0] 1 0
[1] 1 0
[2] 2 0
[3] 3 0
[4] 5 0
[5] 8 0
[6] 13 1
[7] 21 1
[8] 34 2
[9] 55 2
[10] 89 2
Modify the frame with a column showing what bin the data is in. The bins should be lower inclusive:
>>> frame.bin_column('a', [1,5,34,55,89], include_lowest=True,
... strict_binning=False, bin_column_name='binned_using_cutoffs')
[===Job Progress===]
>>> frame.inspect( n=11 )
[##] a binned_using_cutoffs
==============================
[0] 1 0
[1] 1 0
[2] 2 0
[3] 3 0
[4] 5 1
[5] 8 1
[6] 13 1
[7] 21 1
[8] 34 2
[9] 55 3
[10] 89 3
Modify the frame with a column showing what bin the data is in. The bins should be upper inclusive:
>>> frame.bin_column('a', [1,5,34,55,89], include_lowest=False,
... strict_binning=True, bin_column_name='binned_using_cutoffs')
[===Job Progress===]
>>> frame.inspect( n=11 )
[##] a binned_using_cutoffs
==============================
[0] 1 0
[1] 1 0
[2] 2 0
[3] 3 0
[4] 5 0
[5] 8 1
[6] 13 1
[7] 21 1
[8] 34 1
[9] 55 2
[10] 89 3
Modify the frame with a column of 3 equal-width bins. This also returns the cutoffs that were used for creating the bins.
>>> cutoffs = frame.bin_column('a', 3, bin_column_name='equal_width_bins')
>>> print cutoffs
[1.0, 30.333333333333332, 59.666666666666664, 89.0]
>>> frame.inspect(n=frame.count())
[##] a equal_width_bins
==========================
[0] 1 0
[1] 1 0
[2] 2 0
[3] 3 0
[4] 5 0
[5] 8 0
[6] 13 0
[7] 21 0
[8] 34 1
[9] 55 1
[10] 89 2
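The equal-width cutoffs above can be reproduced from the column's minimum and maximum (a standalone sketch, not the sparktk implementation):
>>> lo, hi, n_bins = 1.0, 89.0, 3
>>> width = (hi - lo) / n_bins
>>> cutoffs = [lo + i * width for i in range(n_bins)] + [hi]   # ~[1.0, 30.33, 59.67, 89.0]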
def bin_column(self, column_name, bins=None, include_lowest=True, strict_binning=False, bin_column_name=None):
"""
Summarize rows of data based on the value in a single column by sorting them
into bins, or groups, based on a list of bin cutoff points or a specified number of
equal-width bins.
Parameters
----------
:param column_name: (str) Name of the column to bin
:param bins: (Optional[List[float]]) Either a single value representing the number of equal-width bins to create, or an array of values
containing bin cutoff points. Array can be list or tuple. If an array is provided, values must be progressively
increasing. All bin boundaries must be included, so, with N bins, you need N+1 values.
Default (None or Empty List) is equal-width bins where the maximum number of bins is the Square-root choice
:math:`\lfloor \sqrt{m} \rfloor`, where :math:`m` is the number of rows.
:param include_lowest: (bool) Specify how the boundary conditions are handled. ``True`` indicates that the lower bound
of the bin is inclusive. ``False`` indicates that the upper bound is inclusive. Default is ``True``.
:param strict_binning: (bool) Specify how values outside of the cutoffs array should be binned. If set to ``True``, each
value less than cutoffs[0] or greater than cutoffs[-1] will be assigned a bin value of -1. If set to ``False``,
values less than cutoffs[0] will be included in the first bin while values greater than cutoffs[-1] will be
included in the final bin.
:param bin_column_name: (str) The name for the new binned column. Default is ``<column_name>_binned``
:return: (List[float]) a list containing the edges of each bin
Notes
-----
1. Bins IDs are 0-index, in other words, the lowest bin number is 0.
+ The first and last cutoffs are always included in the bins.
When *include_lowest* is ``True``, the last bin includes both cutoffs.
When *include_lowest* is ``False``, the first bin (bin 0) includes both
cutoffs.
Examples
--------
For these examples, we will use a frame with column *a* accessed by a Frame
object *my_frame*:
>>> frame.inspect(n=11)
[##] a
========
[0] 1
[1] 1
[2] 2
[3] 3
[4] 5
[5] 8
[6] 13
[7] 21
[8] 34
[9] 55
[10] 89
Modify the frame with a column showing what bin the data is in, by
specifying cutoffs for the bin edges.
The data values should use strict_binning:
>>> frame.bin_column('a', [5, 12, 25, 60], include_lowest=True,
... strict_binning=True, bin_column_name='binned_using_cutoffs')
[===Job Progress===]
>>> frame.inspect(n=11)
[##] a binned_using_cutoffs
==============================
[0] 1 -1
[1] 1 -1
[2] 2 -1
[3] 3 -1
[4] 5 0
[5] 8 0
[6] 13 1
[7] 21 1
[8] 34 2
[9] 55 2
[10] 89 -1
Modify the frame with a column showing what bin the data is in.
The data value should not use strict_binning:
>>> frame.bin_column('a', [5, 12, 25, 60], include_lowest=True,
... strict_binning=False, bin_column_name='binned_using_cutoffs')
[===Job Progress===]
>>> frame.inspect(n=11)
[##] a binned_using_cutoffs
==============================
[0] 1 0
[1] 1 0
[2] 2 0
[3] 3 0
[4] 5 0
[5] 8 0
[6] 13 1
[7] 21 1
[8] 34 2
[9] 55 2
[10] 89 2
Modify the frame with a column showing what bin the data is in.
The bins should be lower inclusive:
>>> frame.bin_column('a', [1,5,34,55,89], include_lowest=True,
... strict_binning=False, bin_column_name='binned_using_cutoffs')
[===Job Progress===]
>>> frame.inspect( n=11 )
[##] a binned_using_cutoffs
==============================
[0] 1 0
[1] 1 0
[2] 2 0
[3] 3 0
[4] 5 1
[5] 8 1
[6] 13 1
[7] 21 1
[8] 34 2
[9] 55 3
[10] 89 3
Modify the frame with a column showing what bin the data is in.
The bins should be upper inclusive:
>>> frame.bin_column('a', [1,5,34,55,89], include_lowest=False,
... strict_binning=True, bin_column_name='binned_using_cutoffs')
[===Job Progress===]
>>> frame.inspect( n=11 )
[##] a binned_using_cutoffs
==============================
[0] 1 0
[1] 1 0
[2] 2 0
[3] 3 0
[4] 5 0
[5] 8 1
[6] 13 1
[7] 21 1
[8] 34 1
[9] 55 2
[10] 89 3
Modify the frame with a column of 3 equal-width bins. This also
returns the cutoffs that were used for creating the bins.
>>> cutoffs = frame.bin_column('a', 3, bin_column_name='equal_width_bins')
>>> print cutoffs
[1.0, 30.333333333333332, 59.666666666666664, 89.0]
>>> frame.inspect(n=frame.count())
[##] a equal_width_bins
==========================
[0] 1 0
[1] 1 0
[2] 2 0
[3] 3 0
[4] 5 0
[5] 8 0
[6] 13 0
[7] 21 0
[8] 34 1
[9] 55 1
[10] 89 2
"""
if isinstance(bins, tuple):
bins = list(bins)
elif not isinstance(bins, list):
bins = [bins]
return self._tc.jutils.convert.from_scala_seq(self._scala.binColumn(column_name,
self._tc.jutils.convert.to_scala_option_list_double(bins),
include_lowest,
strict_binning,
self._tc.jutils.convert.to_scala_option(bin_column_name)))
def binary_classification_metrics(
self, label_column, pred_column, pos_label, beta=1.0, frequency_column=None)
Statistics of accuracy, precision, and others for a binary classification model.
:param label_column: (str) The name of the column containing the correct label for each instance.
:param pred_column: (str) The name of the column containing the predicted label for each instance.
:param pos_label: (Any) The value to be interpreted as a positive instance for binary classification.
:param beta: (Optional[float]) This is the beta value to use for the :math:`F_{\beta}` measure (default F1 measure is computed); must be greater than zero. Default is 1.
:param frequency_column: (Optional[str]) The name of an optional column containing the frequency of observations.
:return: (ClassificationMetricsValue) The data returned is composed of multiple components: <object>.accuracy : double, <object>.confusion_matrix : table, <object>.f_measure : double, <object>.precision : double, <object>.recall : double
Calculate the accuracy, precision, confusion_matrix, recall and :math:`F_{\beta}` measure for a classification model.
- The f_measure result is the :math:`F_{\beta}` measure for a classification model. The :math:`F_{\beta}` measure of a binary classification model is the harmonic mean of precision and recall. If we let beta :math:`\equiv \beta`, :math:`T_{P}` denote the number of true positives, :math:`F_{P}` denote the number of false positives, and :math:`F_{N}` denote the number of false negatives, then:
.. math::
    F_{\beta} = (1 + \beta^2) * \frac{\frac{T_{P}}{T_{P} + F_{P}} * \frac{T_{P}}{T_{P} + F_{N}}}{\beta^2 * \frac{T_{P}}{T_{P} + F_{P}} + \frac{T_{P}}{T_{P} + F_{N}}}
The :math:`F_{\beta}` measure for a multi-class classification model is computed as the weighted average of the :math:`F_{\beta}` measure for each label, where the weight is the number of instances of each label. The determination of binary vs. multi-class is automatically inferred from the data.
- The recall result of a binary classification model is the proportion of positive instances that are correctly identified. If we let :math:`T_{P}` denote the number of true positives and :math:`F_{N}` denote the number of false negatives, then the model recall is given by :math:`\frac{T_{P}}{T_{P} + F_{N}}`.
- The precision of a binary classification model is the proportion of predicted positive instances that are correctly identified. If we let :math:`T_{P}` denote the number of true positives and :math:`F_{P}` denote the number of false positives, then the model precision is given by :math:`\frac{T_{P}}{T_{P} + F_{P}}`.
- The accuracy of a classification model is the proportion of predictions that are correctly identified. If we let :math:`T_{P}` denote the number of true positives, :math:`T_{N}` denote the number of true negatives, and :math:`K` denote the total number of classified instances, then the model accuracy is given by :math:`\frac{T_{P} + T_{N}}{K}`.
- The confusion_matrix result is a confusion matrix for a binary classifier model, formatted for human readability.
Consider Frame my_frame, which contains the data
>>> my_frame.inspect()
[#] a b labels predictions
==================================
[0] red 1 0 0
[1] blue 3 1 0
[2] green 1 0 0
[3] green 0 1 1
>>> cm = my_frame.binary_classification_metrics('labels', 'predictions', 1, 1)
[===Job Progress===]
>>> cm.f_measure
0.6666666666666666
>>> cm.recall
0.5
>>> cm.accuracy
0.75
>>> cm.precision
1.0
>>> cm.confusion_matrix
Predicted_Pos Predicted_Neg
Actual_Pos 1 1
Actual_Neg 0 2
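As a sanity check, those metrics follow directly from the confusion matrix above (a standalone sketch, not the sparktk implementation):
>>> tp, fn, fp, tn = 1, 1, 0, 2                        # counts from the confusion matrix
>>> precision = float(tp) / (tp + fp)                  # 1.0
>>> recall = float(tp) / (tp + fn)                     # 0.5
>>> accuracy = float(tp + tn) / (tp + fn + fp + tn)    # 0.75
>>> f1 = 2 * precision * recall / (precision + recall)
>>> round(f1, 4)
0.6667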
def binary_classification_metrics(self, label_column, pred_column, pos_label, beta=1.0, frequency_column=None):
"""
Statistics of accuracy, precision, and others for a binary classification model.
Parameters
----------
:param label_column: (str) The name of the column containing the correct label for each instance.
:param pred_column: (str) The name of the column containing the predicted label for each instance.
:param pos_label: (Any) The value to be interpreted as a positive instance for binary classification.
:param beta: (Optional[float]) This is the beta value to use for :math:`F_{ \beta}` measure (default F1 measure is computed);
must be greater than zero. Defaults is 1.
:param frequency_column: (Optional[str]) The name of an optional column containing the frequency of observations.
:return: (ClassificationMetricsValue) The data returned is composed of multiple components:
<object>.accuracy : double
<object>.confusion_matrix : table
<object>.f_measure : double
<object>.precision : double
<object>.recall : double
Calculate the accuracy, precision, confusion_matrix, recall and :math:`F_{ \beta}` measure for a
classification model.
* The **f_measure** result is the :math:`F_{ \beta}` measure for a
classification model.
The :math:`F_{ \beta}` measure of a binary classification model is the
harmonic mean of precision and recall.
If we let:
* beta :math:`\equiv \beta`,
* :math:`T_{P}` denotes the number of true positives,
* :math:`F_{P}` denotes the number of false positives, and
* :math:`F_{N}` denotes the number of false negatives
then:
.. math::
F_{ \beta} = (1 + \beta ^ 2) * \frac{ \frac{T_{P}}{T_{P} + F_{P}} * \
\frac{T_{P}}{T_{P} + F_{N}}}{ \beta ^ 2 * \frac{T_{P}}{T_{P} + \
F_{P}} + \frac{T_{P}}{T_{P} + F_{N}}}
The :math:`F_{ \beta}` measure for a multi-class classification model is
computed as the weighted average of the :math:`F_{ \beta}` measure for
each label, where the weight is the number of instances of each label.
The determination of binary vs. multi-class is automatically inferred
from the data.
* The **recall** result of a binary classification model is the proportion
of positive instances that are correctly identified.
If we let :math:`T_{P}` denote the number of true positives and
:math:`F_{N}` denote the number of false negatives, then the model
recall is given by :math:`\frac {T_{P}} {T_{P} + F_{N}}`.
* The **precision** of a binary classification model is the proportion of
predicted positive instances that are correctly identified.
If we let :math:`T_{P}` denote the number of true positives and
:math:`F_{P}` denote the number of false positives, then the model
precision is given by: :math:`\frac {T_{P}} {T_{P} + F_{P}}`.
* The **accuracy** of a classification model is the proportion of
predictions that are correctly identified.
If we let :math:`T_{P}` denote the number of true positives,
:math:`T_{N}` denote the number of true negatives, and :math:`K` denote
the total number of classified instances, then the model accuracy is
given by: :math:`\frac{T_{P} + T_{N}}{K}`.
* The **confusion_matrix** result is a confusion matrix for a
binary classifier model, formatted for human readability.
Examples
--------
Consider Frame *my_frame*, which contains the data
>>> my_frame.inspect()
[#] a b labels predictions
==================================
[0] red 1 0 0
[1] blue 3 1 0
[2] green 1 0 0
[3] green 0 1 1
>>> cm = my_frame.binary_classification_metrics('labels', 'predictions', 1, 1)
[===Job Progress===]
>>> cm.f_measure
0.6666666666666666
>>> cm.recall
0.5
>>> cm.accuracy
0.75
>>> cm.precision
1.0
>>> cm.confusion_matrix
Predicted_Pos Predicted_Neg
Actual_Pos 1 1
Actual_Neg 0 2
"""
return ClassificationMetricsValue(self._tc, self._scala.binaryClassificationMetrics(label_column,
pred_column,
pos_label,
float(beta),
self._tc.jutils.convert.to_scala_option(frequency_column)))
def box_cox(
self, column_name, lambda_value=0.0, box_cox_column_name=None)
Calculate the box-cox transformation for each row on a given column of the current frame
:param column_name: Name of the column to perform the transformation on
:param lambda_value: Lambda power parameter. Default is 0.0
:param box_cox_column_name: Optional column name for the box_cox value
:return: (Frame) returns a frame with a new column storing the box-cox transformed value
Calculate the box-cox transformation for each row in column 'column_name' of a frame using the lambda_value.
Box-cox transformation is computed by the following formula:
boxcox = log(y), if lambda = 0
boxcox = (y^lambda - 1) / lambda, otherwise
where log is the natural log
>>> data = [[7.7132064326674596],[0.207519493594015],[6.336482349262754],[7.4880388253861181],[4.9850701230259045]]
>>> schema = [("input", float)]
>>> my_frame = tc.frame.create(data, schema)
>>> my_frame.inspect()
[#] input
===================
[0] 7.71320643267
[1] 0.207519493594
[2] 6.33648234926
[3] 7.48803882539
[4] 4.98507012303
Compute the box-cox transformation on the 'input' column
>>> my_frame.box_cox('input',0.3)
A new column gets added to the frame which stores the box-cox transformation for each row
>>> my_frame.inspect()
[#] input input_lambda_0.3
=====================================
[0] 7.71320643267 2.81913279907
[1] 0.207519493594 -1.25365381375
[2] 6.33648234926 2.46673638752
[3] 7.48803882539 2.76469126003
[4] 4.98507012303 2.06401101556
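The transformed values can be reproduced directly from the formula above (a standalone sketch, not the sparktk implementation):
>>> import math
>>> def box_cox_value(y, lmbda):
...     # natural log when lambda is 0, power transform otherwise
...     return math.log(y) if lmbda == 0 else (y ** lmbda - 1) / lmbda
>>> round(box_cox_value(7.7132064326674596, 0.3), 4)
2.8191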
def box_cox(self, column_name, lambda_value=0.0, box_cox_column_name=None):
"""
Calculate the box-cox transformation for each row on a given column of the current frame
Parameters
----------
:param column_name: Name of the column to perform the transformation on
:param lambda_value: Lambda power parameter. Default is 0.0
:param box_cox_column_name: Optional column name for the box_cox value
:return: (Frame) returns a frame with a new column storing the box-cox transformed value
Calculate the box-cox transformation for each row in column 'column_name' of a frame using the lambda_value.
Box-cox transformation is computed by the following formula:
boxcox = log(y); if lambda=0,
boxcox = (y^lambda -1)/lambda ; else
where log is the natural log
Examples
--------
>>> data = [[7.7132064326674596],[0.207519493594015],[6.336482349262754],[7.4880388253861181],[4.9850701230259045]]
>>> schema = [("input", float)]
>>> my_frame = tc.frame.create(data, schema)
>>> my_frame.inspect()
[#] input
===================
[0] 7.71320643267
[1] 0.207519493594
[2] 6.33648234926
[3] 7.48803882539
[4] 4.98507012303
Compute the box-cox transformation on the 'input' column
>>> my_frame.box_cox('input',0.3)
A new column gets added to the frame which stores the box-cox transformation for each row
>>> my_frame.inspect()
[#] input input_lambda_0.3
=====================================
[0] 7.71320643267 2.81913279907
[1] 0.207519493594 -1.25365381375
[2] 6.33648234926 2.46673638752
[3] 7.48803882539 2.76469126003
[4] 4.98507012303 2.06401101556
"""
self._scala.boxCox(column_name, lambda_value, self._tc.jutils.convert.to_scala_option(box_cox_column_name))
def categorical_summary(
self, columns, top_k=None, threshold=None)
Build summary of the data.
:param columns: (List[CategoricalSummaryInput]) List of CategoricalSummaryInput consisting of column, topk and/or threshold
:param top_k: (Optional[int]) Displays levels which are in the top k most frequently occurring values for that column. Default is 10.
:param threshold: (Optional[float]) Displays levels which are above the threshold percentage with respect to the total row count. Default is 0.0.
:return: (List[CategoricalSummaryOutput]) List of CategoricalSummaryOutput objects for specified column(s) consisting of levels with their frequency and percentage.
Compute a summary of the data in a column(s) for categorical or numerical data types. The returned value is a Map containing categorical summary for each specified column.
For each column, levels which satisfy the top k and/or threshold cutoffs are displayed along with their frequency and percentage occurrence with respect to the total rows in the dataset.
Performs level pruning first based on top k and then filters out levels which satisfy the threshold criterion.
Missing data is reported when a column value is empty ("") or null.
All remaining data is grouped together in the Other category and its frequency and percentage are reported as well.
User must specify the column name and can optionally specify top_k and/or threshold.
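A rough sketch of that pruning order on a plain Python list (hypothetical helper, not the sparktk implementation):
>>> from collections import Counter
>>> def summarize(values, top_k=10, threshold=0.0):
...     total = len(values)
...     missing = sum(1 for v in values if v is None or v == "")
...     counts = Counter(v for v in values if v not in (None, ""))
...     top = counts.most_common(top_k)                # prune to the top_k most frequent levels first...
...     kept = [(lvl, n) for lvl, n in top if float(n) / total > threshold]   # ...then apply the threshold
...     other = total - missing - sum(n for _, n in kept)
...     return kept, missing, other
>>> summarize(['a', 'a', 'a', 'b', 'b', 'c', None], top_k=2)
([('a', 3), ('b', 2)], 1, 1)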
Consider Frame my_frame, which contains the data
>>> my_frame.inspect()
[#] source target
=====================================
[0] entity thing
[1] entity physical_entity
[2] entity abstraction
[3] physical_entity entity
[4] physical_entity matter
[5] physical_entity process
[6] physical_entity thing
[7] physical_entity substance
[8] physical_entity object
[9] physical_entity causal_agent
>>> cm = my_frame.categorical_summary('source', top_k=2)
[===Job Progress===]
>>> cm
column_name = "source"
[#] level frequency percentage
===========================================
[0] thing 9 0.321428571429
[1] abstraction 9 0.321428571429
[2] <Missing> 0 0.0
[3] <Other> 10 0.357142857143
>>> cm = my_frame.categorical_summary('source', threshold = 0.5)
[===Job Progress===]
>>> cm
column_name = "source"
[#] level frequency percentage
=====================================
[0] <Missing> 0 0.0
[1] <Other> 28 1.0
>>> cm = my_frame.categorical_summary(['source', 'target'], top_k=[2, None], threshold=[None, 0.5])
[===Job Progress===]
>>> cm
column_name = "source"
[#] level frequency percentage
===========================================
[0] thing 9 0.321428571429
[1] abstraction 9 0.321428571429
[2] <Missing> 0 0.0
[3] <Other> 10 0.357142857143
<BLANKLINE>
column_name = "target"
[#] level frequency percentage
=====================================
[0] <Missing> 0 0.0
[1] <Other> 28 1.0
def categorical_summary(self, columns, top_k=None, threshold=None):
"""
Build summary of the data.
Parameters
----------
:param columns: (List[CategoricalSummaryInput]) List of CategoricalSummaryInput consisting of column, topk and/or threshold
:param top_k: (Optional[int]) Displays levels which are in the top k most frequently
occurring values for that column.
Default is 10.
:param threshold: (Optional[float]) Displays levels which are above the threshold percentage with
respect to the total row count.
Default is 0.0.
:return: (List[CategoricalSummaryOutput]) List of CategoricalSummaryOutput objects for specified column(s) consisting of levels with
their frequency and percentage.
Compute a summary of the data in a column(s) for categorical or numerical data types.
The returned value is a Map containing categorical summary for each specified column.
For each column, levels which satisfy the top k and/or threshold cutoffs are
displayed along with their frequency and percentage occurrence with respect to
the total rows in the dataset.
Performs level pruning first based on top k and then filters
out levels which satisfy the threshold criterion.
Missing data is reported when a column value is empty ("") or null.
All remaining data is grouped together in the Other category and its frequency
and percentage are reported as well.
User must specify the column name and can optionally specify top_k and/or threshold.
Examples
--------
Consider Frame *my_frame*, which contains the data
>>> my_frame.inspect()
[#] source target
=====================================
[0] entity thing
[1] entity physical_entity
[2] entity abstraction
[3] physical_entity entity
[4] physical_entity matter
[5] physical_entity process
[6] physical_entity thing
[7] physical_entity substance
[8] physical_entity object
[9] physical_entity causal_agent
>>> cm = my_frame.categorical_summary('source', top_k=2)
[===Job Progress===]
>>> cm
column_name = "source"
[#] level frequency percentage
===========================================
[0] thing 9 0.321428571429
[1] abstraction 9 0.321428571429
[2] <Missing> 0 0.0
[3] <Other> 10 0.357142857143
>>> cm = my_frame.categorical_summary('source', threshold = 0.5)
[===Job Progress===]
>>> cm
column_name = "source"
[#] level frequency percentage
=====================================
[0] <Missing> 0 0.0
[1] <Other> 28 1.0
>>> cm = my_frame.categorical_summary(['source', 'target'], top_k=[2, None], threshold=[None, 0.5])
[===Job Progress===]
>>> cm
column_name = "source"
[#] level frequency percentage
===========================================
[0] thing 9 0.321428571429
[1] abstraction 9 0.321428571429
[2] <Missing> 0 0.0
[3] <Other> 10 0.357142857143
<BLANKLINE>
column_name = "target"
[#] level frequency percentage
=====================================
[0] <Missing> 0 0.0
[1] <Other> 28 1.0
"""
if not isinstance(columns, list):
columns = [columns]
columns = self._tc.jutils.convert.to_scala_list_string(columns)
if top_k is not None:
if not isinstance(top_k, list):
top_k = [top_k]
top_k = [self._tc.jutils.convert.to_scala_option(item) for item in top_k]
top_k = self._tc.jutils.convert.to_scala_list(top_k)
if threshold is not None:
if not isinstance(threshold, list):
threshold = [threshold]
threshold = [self._tc.jutils.convert.to_scala_option(item) for item in threshold]
threshold = self._tc.jutils.convert.to_scala_list(threshold)
result_list = list(self._scala.categoricalSummary(columns,
self._tc.jutils.convert.to_scala_option(top_k),
self._tc.jutils.convert.to_scala_option(threshold)))
return CategoricalSummaryOutputList([CategoricalSummaryOutput(item) for item in result_list])
def collect(
self, columns=None)
Brings all the rows of data from the frame into a local python list of lists
(Use the 'take' operation for control over row count and offset of the collected data)
:param columns: (Optional[str or List[str]]) If not None, only the given columns' data will be provided. By default, all columns are included.
:return: (List[List[*]]) the frame data represented as a list of lists
>>> schema = [('name',str), ('age', int), ('tenure', int), ('phone', str)]
>>> rows = [['Fred', 39, 16, '555-1234'], ['Susan', 33, 3, '555-0202'], ['Thurston', 65, 26, '555-4510'], ['Judy', 44, 14, '555-2183']]
>>> frame = tc.frame.create(rows, schema)
>>> frame.collect()
[['Fred', 39, 16, '555-1234'], ['Susan', 33, 3, '555-0202'], ['Thurston', 65, 26, '555-4510'], ['Judy', 44, 14, '555-2183']]
>>> frame.collect(['name', 'phone'])
[['Fred', '555-1234'], ['Susan', '555-0202'], ['Thurston', '555-4510'], ['Judy', '555-2183']]
def collect(self, columns=None):
"""
Brings all the rows of data from the frame into a local python list of lists
(Use the 'take' operation for control over row count and offset of the collected data)
Parameters
----------
:param columns: (Optional[str or List[str]]) If not None, only the given columns' data will be provided.
By default, all columns are included.
:return: (List[List[*]]) the frame data represented as a list of lists
Examples
--------
>>> schema = [('name',str), ('age', int), ('tenure', int), ('phone', str)]
>>> rows = [['Fred', 39, 16, '555-1234'], ['Susan', 33, 3, '555-0202'], ['Thurston', 65, 26, '555-4510'], ['Judy', 44, 14, '555-2183']]
>>> frame = tc.frame.create(rows, schema)
>>> frame.collect()
[['Fred', 39, 16, '555-1234'], ['Susan', 33, 3, '555-0202'], ['Thurston', 65, 26, '555-4510'], ['Judy', 44, 14, '555-2183']]
>>> frame.collect(['name', 'phone'])
[['Fred', '555-1234'], ['Susan', '555-0202'], ['Thurston', '555-4510'], ['Judy', '555-2183']]
"""
if columns is not None:
affirm_type.list_of_str(columns, "columns")
if not columns:
return []
if self._is_scala:
scala_data = self._scala.collect(self._tc.jutils.convert.to_scala_option_list_string(columns))
schema = get_schema_for_columns(self.schema, columns) if columns else self.schema
data = TakeCollectHelper.scala_rows_to_python(self._tc, scala_data, schema)
else:
if columns:
select = TakeCollectHelper.get_select_columns_function(self.schema, columns)
data = self._python.rdd.map(select).collect()
else:
data = self._python.rdd.collect()
return data
def column_median(self, data_column, weights_column=None)
Calculate the (weighted) median of a column.
The median is the least value X in the range of the distribution so that the cumulative weight of values strictly below X is strictly less than half of the total weight and the cumulative weight of values up to and including X is greater than or equal to one-half of the total weight.
All data elements of weight less than or equal to 0 are excluded from the calculation, as are all data elements whose weight is NaN or infinite. If a weight column is provided and no weights are finite numbers greater than 0, None is returned.
data_column | (str): | The column whose median is to be calculated. |
weights_column | (Option[str]): | The column that provides weights (frequencies) for the median calculation. Must contain numerical data. Default is all items have a weight of 1. |
Returns | (varies): | The median of the values. If a weight column is provided and no weights are finite numbers greater than 0, None is returned. The type of the median returned is the same as the contents of the data column, so a column of Longs will result in a Long median and a column of Floats will result in a Float median. |
Given a frame with column 'a' accessed by a Frame object 'my_frame':
>>> data = [[2],[3],[3],[5],[7],[10],[30]]
>>> schema = [('a', int)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Inspect my_frame
>>> my_frame.inspect()
[#] a
=======
[0] 2
[1] 3
[2] 3
[3] 5
[4] 7
[5] 10
[6] 30
Compute and return middle number of values in column a:
>>> median = my_frame.column_median('a')
[===Job Progress===]
>>> print median
5
Given a frame with column 'a' and column 'w' as weights accessed by a Frame object 'my_frame':
>>> data = [[2,1.7],[3,0.5],[3,1.2],[5,0.8],[7,1.1],[10,0.8],[30,0.1]]
>>> schema = [('a', int), ('w', float)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Inspect my_frame
>>> my_frame.inspect()
[#] a w
============
[0] 2 1.7
[1] 3 0.5
[2] 3 1.2
[3] 5 0.8
[4] 7 1.1
[5] 10 0.8
[6] 30 0.1
Compute and return middle number of values in column 'a' with weights 'w':
>>> median = my_frame.column_median('a', weights_column='w')
[===Job Progress===]
>>> print median
3
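As a sanity check (not part of sparktk), the weighted median above can be reproduced directly from the definition with a few lines of plain Python, using the same data and weights as the example:

    # Weighted median per the definition above: the least value x whose cumulative
    # weight (up to and including x) reaches at least half of the total weight.
    def weighted_median(pairs):
        pairs = sorted(pairs)                       # sort by value
        total = sum(w for _, w in pairs)
        cumulative = 0.0
        for value, weight in pairs:
            cumulative += weight
            if cumulative >= total / 2.0:
                return value

    data = [(2, 1.7), (3, 0.5), (3, 1.2), (5, 0.8), (7, 1.1), (10, 0.8), (30, 0.1)]
    print(weighted_median(data))                    # 3, matching column_median above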
def column_median(self, data_column, weights_column=None):
"""
Calculate the (weighted) median of a column.
The median is the least value X in the range of the distribution so that
the cumulative weight of values strictly below X is strictly less than half
of the total weight and the cumulative weight of values up to and including X
is greater than or equal to one-half of the total weight.
All data elements of weight less than or equal to 0 are excluded from the
calculation, as are all data elements whose weight is NaN or infinite.
If a weight column is provided and no weights are finite numbers greater
than 0, None is returned.
Parameters
----------
:param data_column: (str) The column whose median is to be calculated.
:param weights_column: (Option[str]) The column that provides weights (frequencies) for the median calculation.
Must contain numerical data.
Default is all items have a weight of 1.
:return: (varies) The median of the values.
If a weight column is provided and no weights are finite numbers greater
than 0, None is returned.
The type of the median returned is the same as the contents of the data
column, so a column of Longs will result in a Long median and a column of
Floats will result in a Float median.
Examples
--------
Given a frame with column 'a' accessed by a Frame object 'my_frame':
>>> data = [[2],[3],[3],[5],[7],[10],[30]]
>>> schema = [('a', int)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Inspect my_frame
>>> my_frame.inspect()
[#] a
=======
[0] 2
[1] 3
[2] 3
[3] 5
[4] 7
[5] 10
[6] 30
Compute and return middle number of values in column *a*:
>>> median = my_frame.column_median('a')
[===Job Progress===]
>>> print median
5
Given a frame with column 'a' and column 'w' as weights accessed by a Frame object 'my_frame':
>>> data = [[2,1.7],[3,0.5],[3,1.2],[5,0.8],[7,1.1],[10,0.8],[30,0.1]]
>>> schema = [('a', int), ('w', float)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Inspect my_frame
>>> my_frame.inspect()
[#] a w
============
[0] 2 1.7
[1] 3 0.5
[2] 3 1.2
[3] 5 0.8
[4] 7 1.1
[5] 10 0.8
[6] 30 0.1
Compute and return middle number of values in column 'a' with weights 'w':
>>> median = my_frame.column_median('a', weights_column='w')
[===Job Progress===]
>>> print median
3
"""
val = self._scala.columnMedian(data_column, self._tc.jutils.convert.to_scala_option(weights_column))
optional_val = self._tc.jutils.convert.from_scala_option(val)
if optional_val is None:
return None
else:
return optional_val.value()
def column_mode(self, data_column, weights_column=None, max_modes_returned=None)
Evaluate the weights assigned to rows.
Calculate the modes of a column. A mode is a data element of maximum weight. All data elements of weight less than or equal to 0 are excluded from the calculation, as are all data elements whose weight is NaN or infinite. If there are no data elements of finite weight greater than 0, no mode is returned.
Because data distributions often have multiple modes, it is possible for a set of modes to be returned. By default, only one is returned, but by setting the optional parameter max_modes_returned, a larger number of modes can be returned.
data_column | (str): | Name of the column supplying the data. |
weights_column | (Optional[str]): | Name of the column supplying the weights. Default is all items have weight of 1. |
max_modes_returned | (Option[int]): | Maximum number of modes returned. Default is 1. |
Returns | (ColumnMode): | ColumnMode object which includes multiple components (mode, weight_of_mode, total_weight, and mode_count). |
The data returned is composed of multiple components:
mode : A mode is a data element of maximum net weight. A set of modes is returned. The empty set is returned when the sum of the weights is 0. If the number of modes is less than or equal to the parameter max_modes_returned, then all modes of the data are returned. If the number of modes is greater than the max_modes_returned parameter, only the first max_modes_returned many modes (per a canonical ordering) are returned.
weight_of_mode : Weight of a mode. If there are no data elements of finite weight greater than 0, the weight of the mode is 0. If no weights column is given, this is the number of appearances of each mode.
total_weight : Sum of all weights in the weight column. This is the row count if no weights are given. If no weights column is given, this is the number of rows in the table with non-zero weight.
mode_count : The number of distinct modes in the data. In the case that the data is very multimodal, this number may exceed max_modes_returned.
Given a frame with column 'a' accessed by a Frame object 'my_frame':
>>> data = [[2],[3],[3],[5],[7],[10],[30]]
>>> schema = [('a', int)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Inspect my_frame
>>> my_frame.inspect()
[#] a
=======
[0] 2
[1] 3
[2] 3
[3] 5
[4] 7
[5] 10
[6] 30
Compute and return a ColumnMode object containing summary statistics of column a:
>>> mode = my_frame.column_mode('a')
[===Job Progress===]
>>> print mode
mode_count = 1
modes = [3]
total_weight = 7.0
weight_of_mode = 2.0
Given a frame with column 'a' and column 'w' as weights accessed by a Frame object 'my_frame':
>>> data = [[2,1.7],[3,0.5],[3,1.2],[5,0.8],[7,1.1],[10,0.8],[30,0.1]]
>>> schema = [('a', int), ('w', float)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Inspect my_frame
>>> my_frame.inspect()
[#] a w
============
[0] 2 1.7
[1] 3 0.5
[2] 3 1.2
[3] 5 0.8
[4] 7 1.1
[5] 10 0.8
[6] 30 0.1
Compute and return ColumnMode object containing summary statistics of column 'a' with weights 'w':
>>> mode = my_frame.column_mode('a', weights_column='w')
[===Job Progress===]
>>> print mode
mode_count = 2
modes = [2]
total_weight = 6.2
weight_of_mode = 1.7
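The weighted-mode output above can likewise be checked by hand. A minimal plain-Python sketch (independent of sparktk) that tallies the net weight per distinct value:

    from collections import defaultdict

    # Net weight per distinct value; the modes are the values of maximum net weight.
    data = [(2, 1.7), (3, 0.5), (3, 1.2), (5, 0.8), (7, 1.1), (10, 0.8), (30, 0.1)]
    net_weight = defaultdict(float)
    for value, weight in data:
        net_weight[value] += weight

    weight_of_mode = max(net_weight.values())       # 1.7
    modes = sorted(v for v, w in net_weight.items() if w == weight_of_mode)
    print(modes)                    # [2, 3] -> mode_count is 2; sparktk reports one mode by default
    print(sum(w for _, w in data))  # total_weight, approximately 6.2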
def column_mode(self, data_column, weights_column=None, max_modes_returned=None):
"""
Evaluate the weights assigned to rows.
Calculate the modes of a column.
A mode is a data element of maximum weight.
All data elements of weight less than or equal to 0 are excluded from the
calculation, as are all data elements whose weight is NaN or infinite.
If there are no data elements of finite weight greater than 0,
no mode is returned.
Because data distributions often have multiple modes, it is possible for a
set of modes to be returned.
By default, only one is returned, but by setting the optional parameter
max_modes_returned, a larger number of modes can be returned.
Parameters
----------
:param data_column: (str) Name of the column supplying the data.
:param weights_column: (Optional[str]) Name of the column supplying the weights.
Default is all items have weight of 1.
:param max_modes_returned: (Option[int]) Maximum number of modes returned. Default is 1.
:return: (ColumnMode) ColumnMode object which includes multiple components (mode, weight_of_mode, total_weight,
and mode_count).
The data returned is composed of multiple components\:
mode : A mode is a data element of maximum net weight.
A set of modes is returned.
The empty set is returned when the sum of the weights is 0.
If the number of modes is less than or equal to the parameter
max_modes_returned, then all modes of the data are
returned.
If the number of modes is greater than the max_modes_returned
parameter, only the first max_modes_returned many modes (per a
canonical ordering) are returned.
weight_of_mode : Weight of a mode.
If there are no data elements of finite weight greater than 0,
the weight of the mode is 0.
If no weights column is given, this is the number of appearances
of each mode.
total_weight : Sum of all weights in the weight column.
This is the row count if no weights are given.
If no weights column is given, this is the number of rows in
the table with non-zero weight.
mode_count : The number of distinct modes in the data.
In the case that the data is very multimodal, this number may
exceed max_modes_returned.
Examples
--------
Given a frame with column 'a' accessed by a Frame object 'my_frame':
>>> data = [[2],[3],[3],[5],[7],[10],[30]]
>>> schema = [('a', int)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Inspect my_frame
>>> my_frame.inspect()
[#] a
=======
[0] 2
[1] 3
[2] 3
[3] 5
[4] 7
[5] 10
[6] 30
Compute and return a ColumnMode object containing summary statistics of column *a*:
>>> mode = my_frame.column_mode('a')
[===Job Progress===]
>>> print mode
mode_count = 1
modes = [3]
total_weight = 7.0
weight_of_mode = 2.0
Given a frame with column 'a' and column 'w' as weights accessed by a Frame object 'my_frame':
>>> data = [[2,1.7],[3,0.5],[3,1.2],[5,0.8],[7,1.1],[10,0.8],[30,0.1]]
>>> schema = [('a', int), ('w', float)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Inspect my_frame
>>> my_frame.inspect()
[#] a w
============
[0] 2 1.7
[1] 3 0.5
[2] 3 1.2
[3] 5 0.8
[4] 7 1.1
[5] 10 0.8
[6] 30 0.1
Compute and return ColumnMode object containing summary statistics of column 'a' with weights 'w':
>>> mode = my_frame.column_mode('a', weights_column='w')
[===Job Progress===]
>>> print mode
mode_count = 2
modes = [2]
total_weight = 6.2
weight_of_mode = 1.7
"""
return ColumnMode(self._scala.columnMode(data_column,
self._tc.jutils.convert.to_scala_option(weights_column),
self._tc.jutils.convert.to_scala_option(max_modes_returned)))
def column_summary_statistics(self, data_column, weights_column=None, use_popultion_variance=False)
Calculate multiple statistics for a column.
data_column | (str): | The column to be statistically summarized. Must contain numerical data; all NaNs and infinite values are excluded from the calculation. |
weights_column | (Optional[str]): | Name of column holding weights of column values. |
use_popultion_variance | (Optional[bool]): | If true, the variance is calculated as the population variance. If false, the variance is calculated as the sample variance. Because this option affects the variance, it affects the standard deviation and the confidence intervals as well. Default is false. |
Returns | (ColumnSummaryStatistics): | ColumnSummaryStatistics object containing summary statistics. |
The data returned is composed of multiple components:
- mean : [ double | None ]
  Arithmetic mean of the data.
- geometric_mean : [ double | None ]
  Geometric mean of the data. None when there is a data element <= 0, 1.0 when there are no data elements.
- variance : [ double | None ]
  None when there are <= 1 many data elements. Sample variance is the weighted sum of the squared distance of each data element from the weighted mean, divided by the total weight minus 1. None when the sum of the weights is <= 1. Population variance is the weighted sum of the squared distance of each data element from the weighted mean, divided by the total weight.
- standard_deviation : [ double | None ]
  The square root of the variance. None when sample variance is being used and the sum of weights is <= 1.
- total_weight : long
  The count of all data elements that are finite numbers. In other words, after excluding NaNs and infinite values.
- minimum : [ double | None ]
  Minimum value in the data. None when there are no data elements.
- maximum : [ double | None ]
  Maximum value in the data. None when there are no data elements.
- mean_confidence_lower : [ double | None ]
  Lower limit of the 95% confidence interval about the mean. Assumes a Gaussian distribution. None when there are no elements of positive weight.
- mean_confidence_upper : [ double | None ]
  Upper limit of the 95% confidence interval about the mean. Assumes a Gaussian distribution. None when there are no elements of positive weight.
- bad_row_count : [ double | None ]
  The number of rows containing a NaN or infinite value in either the data or weights column.
- good_row_count : [ double | None ]
  The number of rows not containing a NaN or infinite value in either the data or weights column.
- positive_weight_count : [ double | None ]
  The number of valid data elements with weight > 0. This is the number of entries used in the statistical calculation.
- non_positive_weight_count : [ double | None ]
  The number of valid data elements with finite weight <= 0.
- Sample Variance
  Sample Variance is computed by the following formula:
  .. math::
      \left( \frac{1}{W - 1} \right) * \sum_{i} \left(x_{i} - M \right)^{2}
  where :math:`W` is the sum of weights over valid elements of positive weight, and :math:`M` is the weighted mean.
- Population Variance
  Population Variance is computed by the following formula:
  .. math::
      \left( \frac{1}{W} \right) * \sum_{i} \left(x_{i} - M \right)^{2}
  where :math:`W` is the sum of weights over valid elements of positive weight, and :math:`M` is the weighted mean.
- Standard Deviation
  The square root of the variance.
- Logging Invalid Data
  A row is bad when it contains a NaN or infinite value in either its data or weights column. In this case, it contributes to bad_row_count; otherwise it contributes to good_row_count. A good row can be skipped because the value in its weight column is less than or equal to 0. In this case, it contributes to non_positive_weight_count, otherwise (when the weight is greater than 0) it contributes to valid_data_weight_pair_count.
Equations
bad_row_count + good_row_count = # rows in the frame
positive_weight_count + non_positive_weight_count = good_row_count
In particular, when no weights column is provided and all weights are 1.0:
non_positive_weight_count = 0 and
positive_weight_count = good_row_count
Given a frame with column 'a' accessed by a Frame object 'my_frame':
>>> data = [[2],[3],[3],[5],[7],[10],[30]]
>>> schema = [('a', int)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Inspect my_frame
>>> my_frame.inspect()
[#] a
=======
[0] 2
[1] 3
[2] 3
[3] 5
[4] 7
[5] 10
[6] 30
Compute and return summary statistics for values in column a:
>>> summary_statistics = my_frame.column_summary_statistics('a')
[===Job Progress===]
>>> print summary_statistics
bad_row_count = 0
geometric_mean = 5.67257514519
good_row_count = 7
maximum = 30.0
mean = 8.57142857143
mean_confidence_lower = 1.27708372993
mean_confidence_upper = 15.8657734129
minimum = 2.0
non_positive_weight_count = 0
positive_weight_count = 7
standard_deviation = 9.84644001416
total_weight = 7.0
variance = 96.9523809524
Given a frame with column 'a' and column 'w' as weights accessed by a Frame object 'my_frame':
>>> data = [[2,1.7],[3,0.5],[3,1.2],[5,0.8],[7,1.1],[10,0.8],[30,0.1]]
>>> schema = [('a', int), ('w', float)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Inspect my_frame
>>> my_frame.inspect()
[#] a w
============
[0] 2 1.7
[1] 3 0.5
[2] 3 1.2
[3] 5 0.8
[4] 7 1.1
[5] 10 0.8
[6] 30 0.1
Compute and return summary statistics for values in column 'a' with weights 'w':
>>> summary_statistics = my_frame.column_summary_statistics('a', weights_column='w')
[===Job Progress===]
>>> print summary_statistics
bad_row_count = 0
geometric_mean = 4.03968288152
good_row_count = 7
maximum = 30.0
mean = 5.03225806452
mean_confidence_lower = 1.42847242276
mean_confidence_upper = 8.63604370627
minimum = 2.0
non_positive_weight_count = 0
positive_weight_count = 7
standard_deviation = 4.57824177679
total_weight = 6.2
variance = 20.9602977667
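The weighted mean and sample variance in the second example follow directly from the formulas above. A minimal plain-Python check (not part of sparktk), using the same data and weights:

    data = [(2, 1.7), (3, 0.5), (3, 1.2), (5, 0.8), (7, 1.1), (10, 0.8), (30, 0.1)]
    W = sum(w for _, w in data)                      # total_weight, approximately 6.2
    M = sum(x * w for x, w in data) / W              # weighted mean
    sample_variance = sum(w * (x - M) ** 2 for x, w in data) / (W - 1)
    print(M)                        # ~5.03225806452  (mean)
    print(sample_variance)          # ~20.9602977667  (variance)
    print(sample_variance ** 0.5)   # ~4.57824177679  (standard_deviation)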
def column_summary_statistics(self, data_column, weights_column=None, use_popultion_variance=False):
"""
Calculate multiple statistics for a column.
Parameters
----------
:param data_column: (str) The column to be statistically summarized.
Must contain numerical data; all NaNs and infinite values are excluded from the calculation.
:param weights_column: (Optional[str]) Name of column holding weights of column values.
:param use_popultion_variance: (Optional[bool]) If true, the variance is calculated as the population variance.
If false, the variance is calculated as the sample variance.
Because this option affects the variance, it affects the standard deviation and
the confidence intervals as well.
Default is false.
:return: (ColumnSummaryStatistics) ColumnSummaryStatistics object containing summary statistics.
The data returned is composed of multiple components:
* mean : [ double | None ]
Arithmetic mean of the data.
* geometric_mean : [ double | None ]
Geometric mean of the data. None when there is a data element <= 0, 1.0 when there are no data elements.
* variance : [ double | None ]
None when there are <= 1 many data elements. Sample variance is the weighted sum of the squared distance of each data element from the weighted mean, divided by the total weight minus 1. None when the sum of the weights is <= 1. Population variance is the weighted sum of the squared distance of each data element from the weighted mean, divided by the total weight.
* standard_deviation : [ double | None ]
The square root of the variance. None when sample variance is being used and the sum of weights is <= 1.
* total_weight : long
The count of all data elements that are finite numbers. In other words, after excluding NaNs and infinite values.
* minimum : [ double | None ]
Minimum value in the data. None when there are no data elements.
* maximum : [ double | None ]
Maximum value in the data. None when there are no data elements.
* mean_confidence_lower : [ double | None ]
Lower limit of the 95% confidence interval about the mean. Assumes a Gaussian distribution. None when there are no elements of positive weight.
* mean_confidence_upper : [ double | None ]
Upper limit of the 95% confidence interval about the mean. Assumes a Gaussian distribution. None when there are no elements of positive weight.
* bad_row_count : [ double | None ]
The number of rows containing a NaN or infinite value in either the data or weights column.
* good_row_count : [ double | None ]
The number of rows not containing a NaN or infinite value in either the data or weights column.
* positive_weight_count : [ double | None ]
The number of valid data elements with weight > 0. This is the number of entries used in the statistical calculation.
* non_positive_weight_count : [ double | None ]
The number of valid data elements with finite weight <= 0.
Notes
-----
* Sample Variance
Sample Variance is computed by the following formula:
.. math::
\left( \frac{1}{W - 1} \right) * \sum_{i} \left(x_{i} - M \right)^{2}
where :math:`W` is sum of weights over valid elements of positive
weight, and :math:`M` is the weighted mean.
* Population Variance
Population Variance is computed by the following formula:
.. math::
\left( \frac{1}{W} \right) * \sum_{i} \left(x_{i} - M \right)^{2}
where :math:`W` is sum of weights over valid elements of positive
weight, and :math:`M` is the weighted mean.
* Standard Deviation
The square root of the variance.
* Logging Invalid Data
A row is bad when it contains a NaN or infinite value in either
its data or weights column.
In this case, it contributes to bad_row_count; otherwise it
contributes to good row count.
A good row can be skipped because the value in its weight
column is less than or equal to 0.
In this case, it contributes to non_positive_weight_count, otherwise
(when the weight is greater than 0) it contributes to
valid_data_weight_pair_count.
**Equations**
bad_row_count + good_row_count = # rows in the frame
positive_weight_count + non_positive_weight_count = good_row_count
In particular, when no weights column is provided and all weights are 1.0:
non_positive_weight_count = 0 and
positive_weight_count = good_row_count
Examples
--------
Given a frame with column 'a' accessed by a Frame object 'my_frame':
>>> data = [[2],[3],[3],[5],[7],[10],[30]]
>>> schema = [('a', int)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Inspect my_frame
>>> my_frame.inspect()
[#] a
=======
[0] 2
[1] 3
[2] 3
[3] 5
[4] 7
[5] 10
[6] 30
Compute and return summary statistics for values in column *a*:
>>> summary_statistics = my_frame.column_summary_statistics('a')
[===Job Progress===]
>>> print summary_statistics
bad_row_count = 0
geometric_mean = 5.67257514519
good_row_count = 7
maximum = 30.0
mean = 8.57142857143
mean_confidence_lower = 1.27708372993
mean_confidence_upper = 15.8657734129
minimum = 2.0
non_positive_weight_count = 0
positive_weight_count = 7
standard_deviation = 9.84644001416
total_weight = 7.0
variance = 96.9523809524
Given a frame with column 'a' and column 'w' as weights accessed by a Frame object 'my_frame':
>>> data = [[2,1.7],[3,0.5],[3,1.2],[5,0.8],[7,1.1],[10,0.8],[30,0.1]]
>>> schema = [('a', int), ('w', float)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Inspect my_frame
>>> my_frame.inspect()
[#] a w
============
[0] 2 1.7
[1] 3 0.5
[2] 3 1.2
[3] 5 0.8
[4] 7 1.1
[5] 10 0.8
[6] 30 0.1
Compute and return summary statistics for values in column 'a' with weights 'w':
>>> summary_statistics = my_frame.column_summary_statistics('a', weights_column='w')
[===Job Progress===]
>>> print summary_statistics
bad_row_count = 0
geometric_mean = 4.03968288152
good_row_count = 7
maximum = 30.0
mean = 5.03225806452
mean_confidence_lower = 1.42847242276
mean_confidence_upper = 8.63604370627
minimum = 2.0
non_positive_weight_count = 0
positive_weight_count = 7
standard_deviation = 4.57824177679
total_weight = 6.2
variance = 20.9602977667
"""
return ColumnSummaryStatistics(self._scala.columnSummaryStatistics(data_column,
self._tc.jutils.convert.to_scala_option(weights_column),
use_popultion_variance))
def copy(self, columns=None, where=None)
New frame with copied columns.
columns | (str, List[str], or dictionary(str,str)): | If not None, the copy will only include the columns specified. If dict, the string pairs represent a column renaming { source_column_name : destination_column_name } |
where | (UDF): | Optionally provide a where function. If not None, only those rows for which the UDF evaluates to True will be copied. |
Returns | (Frame): | New Frame object. |
Copies specified columns into a new Frame object, optionally renaming them and/or filtering them. Useful for frame query.
Consider the following frame of employee names, age, and years of service:
>>> frame.inspect()
[#] name age years
=========================
[0] Thurston 64 26
[1] Judy 44 14
[2] Emily 37 5
[3] Frank 50 18
[4] Joe 43 11
[5] Ruth 52 21
>>> frame.schema
[('name', <type 'str'>), ('age', <type 'int'>), ('years', <type 'int'>)]
To create a duplicate copy of the frame, use the copy operation with no parameters:
>>> duplicate = frame.copy()
[===Job Progress===]
>>> duplicate.inspect()
[#] name age years
=========================
[0] Thurston 64 26
[1] Judy 44 14
[2] Emily 37 5
[3] Frank 50 18
[4] Joe 43 11
[5] Ruth 52 21
Using the copy operation, we can also limit the new frame to just include the 'name' column:
>>> names = frame.copy("name")
[===Job Progress===]
>>> names.inspect()
[#] name
=============
[0] Thurston
[1] Judy
[2] Emily
[3] Frank
[4] Joe
[5] Ruth
We could also include a UDF to filter the data that is included in the new frame, and also provide a dictionary to rename the column(s) in the new frame. Here we will use copy to create a frame of names for the employees that have over 20 years of service and also rename of the 'name' column to 'first_name':
>>> names = frame.copy({"name" : "first_name"}, lambda row: row.years > 20)
[===Job Progress===]
>>> names.inspect()
[#] first_name
===============
[0] Thurston
[1] Ruth
def copy(self, columns=None, where=None):
"""
New frame with copied columns.
Parameters
----------
:param columns: (str, List[str], or dictionary(str,str)) If not None, the copy will only include the
columns specified. If dict, the string pairs represent a column renaming
{ source_column_name : destination_column_name }
:param where: (UDF) Optionally provide a where function. If not None, only those rows for which the UDF
evaluates to True will be copied.
:return: (Frame) New Frame object.
Copies specified columns into a new Frame object, optionally renaming them and/or filtering them.
Useful for frame query.
Examples
--------
Consider the following frame of employee names, age, and years of service:
>>> frame.inspect()
[#] name age years
=========================
[0] Thurston 64 26
[1] Judy 44 14
[2] Emily 37 5
[3] Frank 50 18
[4] Joe 43 11
[5] Ruth 52 21
>>> frame.schema
[('name', <type 'str'>), ('age', <type 'int'>), ('years', <type 'int'>)]
To create a duplicate copy of the frame, use the copy operation with no parameters:
>>> duplicate = frame.copy()
[===Job Progress===]
>>> duplicate.inspect()
[#] name age years
=========================
[0] Thurston 64 26
[1] Judy 44 14
[2] Emily 37 5
[3] Frank 50 18
[4] Joe 43 11
[5] Ruth 52 21
Using the copy operation, we can also limit the new frame to just include the 'name' column:
>>> names = frame.copy("name")
[===Job Progress===]
>>> names.inspect()
[#] name
=============
[0] Thurston
[1] Judy
[2] Emily
[3] Frank
[4] Joe
[5] Ruth
We could also include a UDF to filter the data that is included in the new frame, and also provide
a dictionary to rename the column(s) in the new frame. Here we will use copy to create a frame of
names for the employees that have over 20 years of service and also rename of the 'name' column to
'first_name':
>>> names = frame.copy({"name" : "first_name"}, lambda row: row.years > 20)
[===Job Progress===]
>>> names.inspect()
[#] first_name
===============
[0] Thurston
[1] Ruth
"""
new_rdd = self._python.rdd
if where is not None and not isinstance(where, types.FunctionType):
raise ValueError("Unsupported type for 'where' parameter. Must be a function or None, but is: {0}".format(type(where)))
if isinstance(columns, str):
columns = [columns]
if isinstance(columns, list):
column_indices = [i for i, column in enumerate(self._python.schema) if column[0] in columns]
elif isinstance(columns, dict):
column_indices = [i for i, column in enumerate(self._python.schema) if column[0] in columns.keys()]
elif columns is None:
column_indices = xrange(0, len(self._python.schema))
else:
raise ValueError("Unsupported type for 'columns' parameter. Expected str, list, dict, or None, but was: {0}".format(type(columns)))
if where is not None:
# If a udf is provided, apply that function and apply the new schema
row = Row(self._python.schema)
def copy_func(r):
row._set_data(r)
return where(row)
new_rdd = self._python.rdd.filter(lambda r: copy_func(r))
if len(column_indices) < len(self._python.schema):
# Map rows to only include the specified columns
row = Row(self._python.schema)
def map_func(r):
row._set_data(r)
return list(row[i] for i in column_indices)
new_rdd = new_rdd.map(lambda r: map_func(r))
new_schema = list(self._python.schema[i] for i in column_indices)
# If columns are being renamed through a dictionary, alter the schema
if (isinstance(columns, dict)):
renamed_schema = []
for column in new_schema:
if columns.has_key(column[0]):
new_name = columns[column[0]]
renamed_schema.append((new_name, column[1]))
new_schema = renamed_schema
# return new frame with the filtered rdd and new schema
return self._tc.frame.create(new_rdd, new_schema)
def correlation(self, column_a, column_b)
Calculate correlation for two columns of current frame.
column_a | (str): | The name of the column from which to compute the correlation. |
column_b | (str): | The name of the column from which to compute the correlation. |
Returns | (float): | Pearson correlation coefficient of the two columns. |
This method applies only to columns containing numerical data.
Consider Frame my_frame, which contains the data
>>> my_frame.inspect()
[#] idnum x1 x2 x3 x4
===============================
[0] 0 1.0 4.0 0.0 -1.0
[1] 1 2.0 3.0 0.0 -1.0
[2] 2 3.0 2.0 1.0 -1.0
[3] 3 4.0 1.0 2.0 -1.0
[4] 4 5.0 0.0 2.0 -1.0
my_frame.correlation computes the common correlation coefficient (Pearson's) on the pair of columns provided. In this example, the idnum and most of the columns have trivial correlations: -1, 0, or +1. Column x3 provides a contrasting coefficient of 3 / sqrt(10) = 0.948683298051.
>>> my_frame.correlation("x1", "x2")
-0.9999999999999998
>>> my_frame.correlation("x1", "x4")
nan
>>> my_frame.correlation("x2", "x3")
-0.9486832980505138
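The x1/x3 coefficient of 3 / sqrt(10) can be verified by computing Pearson's correlation from its definition. A minimal plain-Python sketch (independent of sparktk), using the same column values:

    import math

    # Pearson correlation: covariance of deviations divided by the product of the
    # standard deviations (the (n-1) factors cancel, so raw sums are used here).
    def pearson(xs, ys):
        n = len(xs)
        mean_x = sum(xs) / float(n)
        mean_y = sum(ys) / float(n)
        cov = sum((x - mean_x) * (y - mean_y) for x, y in zip(xs, ys))
        var_x = sum((x - mean_x) ** 2 for x in xs)
        var_y = sum((y - mean_y) ** 2 for y in ys)
        return cov / math.sqrt(var_x * var_y)

    x1 = [1.0, 2.0, 3.0, 4.0, 5.0]
    x3 = [0.0, 0.0, 1.0, 2.0, 2.0]
    print(pearson(x1, x3))          # ~0.948683298051, i.e. 3 / sqrt(10)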
def correlation(self, column_a, column_b):
"""
Calculate correlation for two columns of current frame.
Parameters
----------
:param column_a: (str) The name of the column from which to compute the correlation.
:param column_b: (str) The name of the column from which to compute the correlation.
:return: (float) Pearson correlation coefficient of the two columns.
Notes
-----
This method applies only to columns containing numerical data.
Examples
--------
Consider Frame *my_frame*, which contains the data
>>> my_frame.inspect()
[#] idnum x1 x2 x3 x4
===============================
[0] 0 1.0 4.0 0.0 -1.0
[1] 1 2.0 3.0 0.0 -1.0
[2] 2 3.0 2.0 1.0 -1.0
[3] 3 4.0 1.0 2.0 -1.0
[4] 4 5.0 0.0 2.0 -1.0
my_frame.correlation computes the common correlation coefficient (Pearson's) on the pair
of columns provided.
In this example, the *idnum* and most of the columns have trivial correlations: -1, 0, or +1.
Column *x3* provides a contrasting coefficient of 3 / sqrt(10) = 0.948683298051.
>>> my_frame.correlation("x1", "x2")
-0.9999999999999998
>>> my_frame.correlation("x1", "x4")
nan
>>> my_frame.correlation("x2", "x3")
-0.9486832980505138
"""
return self._scala.correlation(column_a, column_b)
def correlation_matrix(self, data_column_names)
Calculate correlation matrix for two or more columns.
data_column_names | (List[str]): | The names of the columns from which to compute the matrix. |
Returns | (Frame): | A Frame with the matrix of the correlation values for the columns. |
This method applies only to columns containing numerical data.
Consider Frame my_frame, which contains the data
>>> my_frame.inspect()
[#] idnum x1 x2 x3 x4
===============================
[0] 0 1.0 4.0 0.0 -1.0
[1] 1 2.0 3.0 0.0 -1.0
[2] 2 3.0 2.0 1.0 -1.0
[3] 3 4.0 1.0 2.0 -1.0
[4] 4 5.0 0.0 2.0 -1.0
my_frame.correlation_matrix computes the common correlation coefficient (Pearson's) on each pair of columns in the user-provided list. In this example, the idnum and most of the columns have trivial correlations: -1, 0, or +1. Column x3 provides a contrasting coefficient of 3 / sqrt(10) = 0.948683298051
>>> corr_matrix = my_frame.correlation_matrix(my_frame.column_names)
[===Job Progress===]
The resulting table (specifying all columns) is:
>>> corr_matrix.inspect()
[#] idnum x1 x2 x3 x4
==========================================================================
[0] 1.0 1.0 -1.0 0.948683298051 nan
[1] 1.0 1.0 -1.0 0.948683298051 nan
[2] -1.0 -1.0 1.0 -0.948683298051 nan
[3] 0.948683298051 0.948683298051 -0.948683298051 1.0 nan
[4] nan nan nan nan 1.0
def correlation_matrix(self, data_column_names):
"""
Calculate correlation matrix for two or more columns.
Parameters
----------
:param data_column_names: (List[str]) The names of the columns from which to compute the matrix.
:return: (Frame) A Frame with the matrix of the correlation values for the columns.
Notes
-----
This method applies only to columns containing numerical data.
Examples
--------
Consider Frame *my_frame*, which contains the data
>>> my_frame.inspect()
[#] idnum x1 x2 x3 x4
===============================
[0] 0 1.0 4.0 0.0 -1.0
[1] 1 2.0 3.0 0.0 -1.0
[2] 2 3.0 2.0 1.0 -1.0
[3] 3 4.0 1.0 2.0 -1.0
[4] 4 5.0 0.0 2.0 -1.0
my_frame.correlation_matrix computes the common correlation coefficient (Pearson's) on each pair
of columns in the user-provided list.
In this example, the *idnum* and most of the columns have trivial correlations: -1, 0, or +1.
Column *x3* provides a contrasting coefficient of 3 / sqrt(10) = 0.948683298051
>>> corr_matrix = my_frame.correlation_matrix(my_frame.column_names)
[===Job Progress===]
The resulting table (specifying all columns) is:
>>> corr_matrix.inspect()
[#] idnum x1 x2 x3 x4
==========================================================================
[0] 1.0 1.0 -1.0 0.948683298051 nan
[1] 1.0 1.0 -1.0 0.948683298051 nan
[2] -1.0 -1.0 1.0 -0.948683298051 nan
[3] 0.948683298051 0.948683298051 -0.948683298051 1.0 nan
[4] nan nan nan nan 1.0
"""
from sparktk.frame.frame import Frame
return Frame(self._tc,
self._scala.correlationMatrix(self._tc.jutils.convert.to_scala_list_string(data_column_names)))
def count(self, where=None)
Counts all rows or all qualified rows.
where | (UDF): | Optional function which evaluates a row to a boolean to determine if it should be counted |
Returns | (int): | Number of rows counted |
Counts all rows or all rows which meet criteria specified by a UDF predicate.
>>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
... ['Susan',33,3,'555-0202'],
... ['Thurston',65,26,'555-4510'],
... ['Judy',44,14,'555-2183']],
... schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])
>>> frame.inspect()
[#] name age tenure phone
====================================
[0] Fred 39 16 555-1234
[1] Susan 33 3 555-0202
[2] Thurston 65 26 555-4510
[3] Judy 44 14 555-2183
>>> frame.count()
4
>>> frame.count(lambda row: row.age > 35)
3
def count(self, where=None):
"""
Counts all rows or all qualified rows.
Parameters
----------
:param where: (UDF) Optional function which evaluates a row to a boolean to determine if it should be counted
:return: (int) Number of rows counted
Counts all rows or all rows which meet criteria specified by a UDF predicate.
Examples
--------
>>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
... ['Susan',33,3,'555-0202'],
... ['Thurston',65,26,'555-4510'],
... ['Judy',44,14,'555-2183']],
... schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])
>>> frame.inspect()
[#] name age tenure phone
====================================
[0] Fred 39 16 555-1234
[1] Susan 33 3 555-0202
[2] Thurston 65 26 555-4510
[3] Judy 44 14 555-2183
>>> frame.count()
4
>>> frame.count(lambda row: row.age > 35)
3
"""
if where:
row = Row(self.schema)
def count_where(r):
row._set_data(r)
return where(row)
return self._python.rdd.filter(lambda r: count_where(r)).count()
else:
if self._is_scala:
return int(self._scala.rowCount())
return self.rdd.count()
def covariance(self, column_a, column_b)
Calculate covariance for exactly two columns.
column_a | (str): | The name of the column from which to compute the covariance. |
column_b | (str): | The name of the column from which to compute the covariance. |
Returns | (float): | Covariance of the two columns. |
This method applies only to columns containing numerical data.
Consider Frame my_frame, which contains the data
>>> my_frame.inspect()
[#] idnum x1 x2 x3 x4
===============================
[0] 0 1.0 4.0 0.0 -1.0
[1] 1 2.0 3.0 0.0 -1.0
[2] 2 3.0 2.0 1.0 -1.0
[3] 3 4.0 1.0 2.0 -1.0
[4] 4 5.0 0.0 2.0 -1.0
my_frame.covariance computes the covariance on the pair of columns provided.
>>> my_frame.covariance("x1", "x2")
-2.5
>>> my_frame.covariance("x1", "x4")
0.0
>>> my_frame.covariance("x2", "x3")
-1.5
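The documented values are consistent with the sample covariance (sum of deviation products divided by n - 1). A minimal plain-Python check of the x1/x2 value, independent of sparktk:

    # Sample covariance: sum of products of deviations from the means, divided by n - 1.
    def sample_covariance(xs, ys):
        n = len(xs)
        mean_x = sum(xs) / float(n)
        mean_y = sum(ys) / float(n)
        return sum((x - mean_x) * (y - mean_y) for x, y in zip(xs, ys)) / (n - 1)

    x1 = [1.0, 2.0, 3.0, 4.0, 5.0]
    x2 = [4.0, 3.0, 2.0, 1.0, 0.0]
    print(sample_covariance(x1, x2))     # -2.5, matching my_frame.covariance("x1", "x2")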
def covariance(self, column_a, column_b):
"""
Calculate covariance for exactly two columns.
Parameters
----------
:param column_a: (str) The name of the column from which to compute the covariance.
:param column_b: (str) The name of the column from which to compute the covariance.
:return: (float) Covariance of the two columns.
Notes
-----
This method applies only to columns containing numerical data.
Examples
--------
Consider Frame *my_frame*, which contains the data
>>> my_frame.inspect()
[#] idnum x1 x2 x3 x4
===============================
[0] 0 1.0 4.0 0.0 -1.0
[1] 1 2.0 3.0 0.0 -1.0
[2] 2 3.0 2.0 1.0 -1.0
[3] 3 4.0 1.0 2.0 -1.0
[4] 4 5.0 0.0 2.0 -1.0
my_frame.covariance computes the covariance on the pair of columns provided.
>>> my_frame.covariance("x1", "x2")
-2.5
>>> my_frame.covariance("x1", "x4")
0.0
>>> my_frame.covariance("x2", "x3")
-1.5
"""
return self._scala.covariance(column_a, column_b)
def covariance_matrix(self, data_column_names)
Calculate covariance matrix for two or more columns.
data_column_names | (List[str]): | The names of the column from which to compute the matrix. Names should refer to a single column of type vector, or two or more columns of numeric scalars. |
Returns | (Frame): | A matrix with the covariance values for the columns. |
This function applies only to columns containing numerical data.
Consider Frame my_frame, which contains the data
>>> my_frame.inspect()
[#] idnum x1 x2 x3 x4
===============================
[0] 0 1.0 4.0 0.0 -1.0
[1] 1 2.0 3.0 0.0 -1.0
[2] 2 3.0 2.0 1.0 -1.0
[3] 3 4.0 1.0 2.0 -1.0
[4] 4 5.0 0.0 2.0 -1.0
my_frame.covariance_matrix computes the covariance on each pair of columns in the user-provided list.
>>> cov_matrix = my_frame.covariance_matrix(my_frame.column_names)
[===Job Progress===]
The resulting table (specifying all columns) is:
>>> cov_matrix.inspect()
[#] idnum x1 x2 x3 x4
=================================
[0] 2.5 2.5 -2.5 1.5 0.0
[1] 2.5 2.5 -2.5 1.5 0.0
[2] -2.5 -2.5 2.5 -1.5 0.0
[3] 1.5 1.5 -1.5 1.0 0.0
[4] 0.0 0.0 0.0 0.0 0.0
def covariance_matrix(self, data_column_names):
"""
Calculate covariance matrix for two or more columns.
Parameters
----------
:param data_column_names: (List[str]) The names of the column from which to compute the matrix.
Names should refer to a single column of type vector, or two or more
columns of numeric scalars.
:return: (Frame) A matrix with the covariance values for the columns.
Notes
-----
This function applies only to columns containing numerical data.
Examples
--------
Consider Frame *my_frame*, which contains the data
>>> my_frame.inspect()
[#] idnum x1 x2 x3 x4
===============================
[0] 0 1.0 4.0 0.0 -1.0
[1] 1 2.0 3.0 0.0 -1.0
[2] 2 3.0 2.0 1.0 -1.0
[3] 3 4.0 1.0 2.0 -1.0
[4] 4 5.0 0.0 2.0 -1.0
my_frame.covariance_matrix computes the covariance on each pair of columns in the user-provided list.
>>> cov_matrix = my_frame.covariance_matrix(my_frame.column_names)
[===Job Progress===]
The resulting table (specifying all columns) is:
>>> cov_matrix.inspect()
[#] idnum x1 x2 x3 x4
=================================
[0] 2.5 2.5 -2.5 1.5 0.0
[1] 2.5 2.5 -2.5 1.5 0.0
[2] -2.5 -2.5 2.5 -1.5 0.0
[3] 1.5 1.5 -1.5 1.0 0.0
[4] 0.0 0.0 0.0 0.0 0.0
"""
from sparktk.frame.frame import Frame
return Frame(self._tc,
self._scala.covarianceMatrix(self._tc.jutils.convert.to_scala_list_string(data_column_names)))
def cumulative_percent(self, sample_col)
Add column to frame with cumulative percent.
sample_col | (str): | The name of the column from which to compute the cumulative percent. |
A cumulative percent sum is computed by sequentially stepping through the rows, observing the column values and keeping track of the current percentage of the total sum accounted for at the current value.
This method applies only to columns containing numerical data. Although this method will execute for columns containing negative values, the interpretation of the result will change (for example, negative percentages).
Consider Frame my_frame accessing a frame that contains a single column named obs:
>>> my_frame.inspect()
[#] obs
========
[0] 0
[1] 1
[2] 2
[3] 0
[4] 1
[5] 2
The cumulative percent sum for column obs is obtained by:
>>> my_frame.cumulative_percent('obs')
[===Job Progress===]
The Frame my_frame now contains two columns, obs and obs_cumulative_percent. They contain the original data and the cumulative percent sum, respectively:
>>> my_frame.inspect()
[#] obs obs_cumulative_percent
================================
[0] 0 0.0
[1] 1 0.166666666667
[2] 2 0.5
[3] 0 0.5
[4] 1 0.666666666667
[5] 2 1.0
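Each value in the new column is just the running sum of the column divided by the column total. A short sketch of that computation in plain Python (not the sparktk implementation), using the same obs values:

    obs = [0, 1, 2, 0, 1, 2]
    total = float(sum(obs))          # 6.0
    running = 0
    cumulative_percent = []
    for value in obs:
        running += value
        cumulative_percent.append(running / total)
    print(cumulative_percent)
    # [0.0, 0.1666..., 0.5, 0.5, 0.6666..., 1.0], matching the obs_cumulative_percent column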
def cumulative_percent(self, sample_col):
"""
Add column to frame with cumulative percent.
Parameters
----------
:param sample_col: (str) The name of the column from which to compute the cumulative percent.
A cumulative percent sum is computed by sequentially stepping through the rows,
observing the column values and keeping track of the current percentage of the
total sum accounted for at the current value.
Notes
-----
This method applies only to columns containing numerical data.
Although this method will execute for columns containing negative
values, the interpretation of the result will change (for example,
negative percentages).
Examples
--------
Consider Frame *my_frame* accessing a frame that contains a single
column named *obs*:
>>> my_frame.inspect()
[#] obs
========
[0] 0
[1] 1
[2] 2
[3] 0
[4] 1
[5] 2
The cumulative percent sum for column *obs* is obtained by:
>>> my_frame.cumulative_percent('obs')
[===Job Progress===]
The Frame *my_frame* now contains two columns, *obs* and *obs_cumulative_percent*.
They contain the original data and the cumulative percent sum,
respectively:
>>> my_frame.inspect()
[#] obs obs_cumulative_percent
================================
[0] 0 0.0
[1] 1 0.166666666667
[2] 2 0.5
[3] 0 0.5
[4] 1 0.666666666667
[5] 2 1.0
"""
self._scala.cumulativePercent(sample_col)
def cumulative_sum(self, sample_col)
Add column to frame with cumulative sum.
sample_col | (str): | The name of the column from which to compute the cumulative sum. |
A cumulative sum is computed by sequentially stepping through the rows, observing the column values and keeping track of the cumulative sum for each value.
This method applies only to columns containing numerical data.
Consider Frame my_frame, which accesses a frame that contains a single column named obs:
>>> my_frame.inspect()
[#] obs
========
[0] 0
[1] 1
[2] 2
[3] 0
[4] 1
[5] 2
The cumulative sum for column obs is obtained by:
>>> my_frame.cumulative_sum('obs')
[===Job Progress===]
The Frame my_frame accesses the original frame, which now contains two columns: obs, which contains the original column values, and obs_cumulative_sum, which contains the cumulative sum:
>>> my_frame.inspect()
[#] obs obs_cumulative_sum
============================
[0] 0 0.0
[1] 1 1.0
[2] 2 3.0
[3] 0 3.0
[4] 1 4.0
[5] 2 6.0
def cumulative_sum(self, sample_col):
"""
Add column to frame with cumulative sum.
Parameters
----------
:param sample_col: (str) The name of the column from which to compute the cumulative sum.
A cumulative sum is computed by sequentially stepping through the rows,
observing the column values and keeping track of the cumulative sum for each value.
Notes
-----
This method applies only to columns containing numerical data.
Examples
--------
Consider Frame *my_frame*, which accesses a frame that contains a single
column named *obs*:
>>> my_frame.inspect()
[#] obs
========
[0] 0
[1] 1
[2] 2
[3] 0
[4] 1
[5] 2
The cumulative sum for column *obs* is obtained by:
>>> my_frame.cumulative_sum('obs')
[===Job Progress===]
The Frame *my_frame* accesses the original frame, which now contains two
columns: *obs*, which contains the original column values, and
*obs_cumulative_sum*, which contains the cumulative sum:
>>> my_frame.inspect()
[#] obs obs_cumulative_sum
============================
[0] 0 0.0
[1] 1 1.0
[2] 2 3.0
[3] 0 3.0
[4] 1 4.0
[5] 2 6.0
"""
self._scala.cumulativeSum(sample_col)
def dot_product(self, left_column_names, right_column_names, dot_product_column_name, default_left_values=None, default_right_values=None)
Calculate dot product for each row in current frame.
left_column_names | (List[str]): | Names of columns used to create the left vector (A) for each row. Names should refer to a single column of type vector, or two or more columns of numeric scalars. |
right_column_names | (List[str]): | Names of columns used to create right vector (B) for each row. Names should refer to a single column of type vector, or two or more columns of numeric scalars. |
dot_product_column_name | (str): | Name of column used to store the dot product. |
default_left_values | (Optional[List[float]]): | Default values used to substitute null values in the left vector. Default is None. |
default_right_values | (Optional[List[float]]): | Default values used to substitute null values in the right vector. Default is None. |
Returns | (Frame): | returns a frame with the given "dot_product" column name |
Calculate the dot product for each row in a frame using values from two equal-length sequences of columns.
Dot product is computed by the following formula:
The dot product of two vectors :math:`A=[a_1, a_2, ..., a_n]` and :math:`B=[b_1, b_2, ..., b_n]` is :math:`a_1*b_1 + a_2*b_2 + ... + a_n*b_n`.
The dot product for each row is stored in a new column in the existing frame.
- If default_left_values or default_right_values are not specified, any null values will be replaced by zeros.
- This method applies only to columns containing numerical data.
>>> data = [[1, 0.2, -2, 5], [2, 0.4, -1, 6], [3, 0.6, 0, 7], [4, 0.8, 1, 8]]
>>> schema = [('col_0', int), ('col_1', float),('col_2', int) ,('col_3', int)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Calculate the dot product for a sequence of columns in Frame object my_frame:
>>> my_frame.inspect()
[#] col_0 col_1 col_2 col_3
===============================
[0] 1 0.2 -2 5
[1] 2 0.4 -1 6
[2] 3 0.6 0 7
[3] 4 0.8 1 8
Modify the frame by computing the dot product for a sequence of columns:
>>> my_frame.dot_product(['col_0','col_1'], ['col_2', 'col_3'], 'dot_product')
[===Job Progress===]
>>> my_frame.inspect()
[#] col_0 col_1 col_2 col_3 dot_product
============================================
[0] 1 0.2 -2 5 -1.0
[1] 2 0.4 -1 6 0.4
[2] 3 0.6 0 7 4.2
[3] 4 0.8 1 8 10.4
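Each dot_product value above is simply the per-row sum of products of the left columns (col_0, col_1) with the right columns (col_2, col_3). A minimal plain-Python check (not the sparktk implementation):

    # For each row, left vector A = (col_0, col_1), right vector B = (col_2, col_3).
    rows = [[1, 0.2, -2, 5], [2, 0.4, -1, 6], [3, 0.6, 0, 7], [4, 0.8, 1, 8]]
    for row in rows:
        left, right = row[0:2], row[2:4]
        dot = sum(a * b for a, b in zip(left, right))
        print(dot)      # approximately -1.0, 0.4, 4.2, 10.4 (up to floating-point rounding)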
def dot_product(self, left_column_names, right_column_names, dot_product_column_name, default_left_values=None, default_right_values=None):
"""
Calculate dot product for each row in current frame.
Parameters
----------
:param left_column_names: (List[str]) Names of columns used to create the left vector (A) for each row.
Names should refer to a single column of type vector, or two or more columns of numeric scalars.
:param right_column_names: (List[str]) Names of columns used to create right vector (B) for each row.
Names should refer to a single column of type vector, or two or more columns of numeric scalars.
:param dot_product_column_name: (str) Name of column used to store the dot product.
:param default_left_values: (Optional[List[float]]) Default values used to substitute null values in the left vector. Default is None.
:param default_right_values: (Optional[List[float]]) Default values used to substitute null values in the right vector. Default is None.
:return: (Frame) returns a frame with the given "dot_product" column name
Calculate the dot product for each row in a frame using values from two equal-length sequences of columns.
Dot product is computed by the following formula:
The dot product of two vectors :math:`A=[a_1, a_2, ..., a_n]` and :math:`B =[b_1, b_2, ..., b_n]` is :math:`a_1*b_1 + a_2*b_2 + ...+ a_n*b_n`.
The dot product for each row is stored in a new column in the existing frame.
Notes
-----
* If default_left_values or default_right_values are not specified, any null values will be replaced by zeros.
* This method applies only to columns containing numerical data.
Examples
--------
>>> data = [[1, 0.2, -2, 5], [2, 0.4, -1, 6], [3, 0.6, 0, 7], [4, 0.8, 1, 8]]
>>> schema = [('col_0', int), ('col_1', float),('col_2', int) ,('col_3', int)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
Calculate the dot product for a sequence of columns in Frame object *my_frame*:
>>> my_frame.inspect()
[#] col_0 col_1 col_2 col_3
===============================
[0] 1 0.2 -2 5
[1] 2 0.4 -1 6
[2] 3 0.6 0 7
[3] 4 0.8 1 8
Modify the frame by computing the dot product for a sequence of columns:
>>> my_frame.dot_product(['col_0','col_1'], ['col_2', 'col_3'], 'dot_product')
[===Job Progress===]
>>> my_frame.inspect()
[#] col_0 col_1 col_2 col_3 dot_product
============================================
[0] 1 0.2 -2 5 -1.0
[1] 2 0.4 -1 6 0.4
[2] 3 0.6 0 7 4.2
[3] 4 0.8 1 8 10.4
"""
if not isinstance(left_column_names, list):
left_column_names = [left_column_names]
if not isinstance(right_column_names, list):
right_column_names = [right_column_names]
self._scala.dotProduct(self._tc.jutils.convert.to_scala_list_string(left_column_names),
self._tc.jutils.convert.to_scala_list_string(right_column_names),
dot_product_column_name,
self._tc.jutils.convert.to_scala_option_list_double(default_left_values),
self._tc.jutils.convert.to_scala_option_list_double(default_right_values))
def drop_columns(self, columns)
Drops columns from the frame
columns | (str or List[str]): | names of the columns to drop |
For this example, the Frame object frame accesses a frame with 4 columns (column_a, column_b, column_c and column_d) and drops 2 columns (column_b and column_d) using drop_columns.
>>> print frame.schema
[('column_a', <type 'str'>), ('column_b', <type 'int'>), ('column_c', <type 'str'>), ('column_d', <type 'int'>)]
Eliminate columns column_b and column_d:
>>> frame.drop_columns(["column_b", "column_d"])
>>> print frame.schema
[('column_a', <type 'str'>), ('column_c', <type 'str'>)]
Now the frame only has the columns column_a and column_c.
For further examples, see :ref:`example_frame.drop_columns`.
def drop_columns(self, columns):
"""
Drops columns from the frame
Parameters
----------
:param columns: (str or List[str]) names of the columns to drop
Examples
--------
For this example, the Frame object *frame* accesses a frame with 4 columns
(*column_a*, *column_b*, *column_c* and *column_d*) and drops 2 columns (*column_b* and *column_d*) using drop_columns.
>>> print frame.schema
[('column_a', <type 'str'>), ('column_b', <type 'int'>), ('column_c', <type 'str'>), ('column_d', <type 'int'>)]
Eliminate columns *column_b* and *column_d*:
>>> frame.drop_columns(["column_b", "column_d"])
>>> print frame.schema
[('column_a', <type 'str'>), ('column_c', <type 'str'>)]
Now the frame only has the columns *column_a* and *column_c*.
For further examples, see :ref:`example_frame.drop_columns`.
"""
if isinstance(columns, basestring):
columns = [columns]
if self._is_scala:
self._scala.dropColumns(self._tc.jutils.convert.to_scala_vector_string(columns))
else:
victim_indices = get_indices_for_selected_columns(self.schema, columns)
survivor_indices = [i for i in xrange(len(self.schema)) if i not in victim_indices]
filtered_schema = [self.schema[i] for i in survivor_indices]
def filter_fields(row):
return [row[i] for i in survivor_indices]
filtered_rdd = self.rdd.map(filter_fields)
self._frame = PythonFrame(filtered_rdd, filtered_schema)
def drop_duplicates(self, unique_columns=None)
Modify the current frame, removing duplicate rows.
unique_columns | (Optional[List[str] or str]): | Column name(s) to identify duplicates. Default is the entire row is compared. |
Remove data rows which are the same as other rows. The entire row can be checked for duplication, or the search for duplicates can be limited to one or more columns. This modifies the current frame.
Given a frame with data:
>>> frame.inspect()
[#] a b c
===============
[0] 200 4 25
[1] 200 5 25
[2] 200 4 25
[3] 200 5 35
[4] 200 6 25
[5] 200 8 35
[6] 200 4 45
[7] 200 4 25
[8] 200 5 25
[9] 201 4 25
Remove any rows that are identical to a previous row. The result is a frame of unique rows. Note that row order may change.
>>> frame.drop_duplicates()
[===Job Progress===]
>>> frame.inspect()
[#] a b c
===============
[0] 200 8 35
[1] 200 6 25
[2] 200 5 35
[3] 200 4 45
[4] 200 4 25
[5] 200 5 25
[6] 201 4 25
Now remove any rows that have the same data in columns a and c as a previously checked row:
>>> frame.drop_duplicates([ "a", "c"])
[===Job Progress===]
The result is a frame with unique values for the combination of columns a and c.
>>> frame.inspect()
[#] a b c
===============
[0] 201 4 25
[1] 200 4 45
[2] 200 6 25
[3] 200 8 35
def drop_duplicates(self, unique_columns=None):
"""
Modify the current frame, removing duplicate rows.
Parameters
----------
:param unique_columns: (Optional[List[str] or str]) Column name(s) to identify duplicates. Default is the entire
row is compared.
Remove data rows which are the same as other rows.
The entire row can be checked for duplication, or the search for duplicates can be limited to one or more columns.
This modifies the current frame.
Examples
--------
Given a frame with data:
>>> frame.inspect()
[#] a b c
===============
[0] 200 4 25
[1] 200 5 25
[2] 200 4 25
[3] 200 5 35
[4] 200 6 25
[5] 200 8 35
[6] 200 4 45
[7] 200 4 25
[8] 200 5 25
[9] 201 4 25
Remove any rows that are identical to a previous row.
The result is a frame of unique rows.
Note that row order may change.
>>> frame.drop_duplicates()
[===Job Progress===]
>>> frame.inspect()
[#] a b c
===============
[0] 200 8 35
[1] 200 6 25
[2] 200 5 35
[3] 200 4 45
[4] 200 4 25
[5] 200 5 25
[6] 201 4 25
Now remove any rows that have the same data in columns *a* and
*c* as a previously checked row:
>>> frame.drop_duplicates([ "a", "c"])
[===Job Progress===]
The result is a frame with unique values for the combination of columns *a*
and *c*.
>>> frame.inspect()
[#] a b c
===============
[0] 201 4 25
[1] 200 4 45
[2] 200 6 25
[3] 200 8 35
"""
if unique_columns is not None and not isinstance(unique_columns, list):
unique_columns = [unique_columns]
if isinstance(unique_columns, list):
unique_columns = self._tc.jutils.convert.to_scala_vector_string(unique_columns)
self._scala.dropDuplicates(self._tc.jutils.convert.to_scala_option(unique_columns))
def drop_rows(self, predicate)
Erase any row in the current frame which qualifies.
predicate | (UDF): | Function which evaluates a row to a boolean; rows that answer True are dropped from the frame. |
>>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
... ['Susan',33,3,'555-0202'],
... ['Thurston',65,26,'555-4510'],
... ['Judy',44,14,'555-2183']],
... schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])
>>> frame.inspect()
[#] name age tenure phone
====================================
[0] Fred 39 16 555-1234
[1] Susan 33 3 555-0202
[2] Thurston 65 26 555-4510
[3] Judy 44 14 555-2183
>>> frame.drop_rows(lambda row: row.name[-1] == 'n') # drop people whose name ends in 'n'
>>> frame.inspect()
[#] name age tenure phone
================================
[0] Fred 39 16 555-1234
[1] Judy 44 14 555-2183
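A predicate may also combine several columns; as an illustrative sketch (not part of the original example set):
>>> frame.drop_rows(lambda row: row.age > 40 and row.tenure < 20)  # would drop only Judy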
More information on a |UDF| can be found at :doc:`/ds_apir`.
def drop_rows(self, predicate):
"""
Erase any row in the current frame which qualifies.
Parameters
----------
:param predicate: (UDF) Function which evaluates a row to a boolean; rows that answer True are dropped from
the frame.
Examples
--------
>>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
... ['Susan',33,3,'555-0202'],
... ['Thurston',65,26,'555-4510'],
... ['Judy',44,14,'555-2183']],
... schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])
>>> frame.inspect()
[#] name age tenure phone
====================================
[0] Fred 39 16 555-1234
[1] Susan 33 3 555-0202
[2] Thurston 65 26 555-4510
[3] Judy 44 14 555-2183
>>> frame.drop_rows(lambda row: row.name[-1] == 'n') # drop people whose name ends in 'n'
>>> frame.inspect()
[#] name age tenure phone
================================
[0] Fred 39 16 555-1234
[1] Judy 44 14 555-2183
More information on a |UDF| can be found at :doc:`/ds_apir`.
"""
row = Row(self.schema)
def drop_rows_func(r):
row._set_data(r)
return not predicate(row)
self._python.rdd = self._python.rdd.filter(drop_rows_func)
def ecdf(
self, column)
Builds new frame with columns for data and distribution.
column | (str): | The name of the input column containing the sample data. |
Returns | (Frame): | A new Frame containing each distinct value in the sample and its corresponding ECDF value. |
Generates the :term:`empirical cumulative distribution` for the input column.
Consider the following sample data set in frame 'frame' containing several numbers.
>>> frame.inspect()
[#] numbers
============
[0] 1
[1] 3
[2] 1
[3] 0
[4] 2
[5] 1
[6] 4
[7] 3
>>> ecdf_frame = frame.ecdf('numbers')
[===Job Progress===]
>>> ecdf_frame.inspect()
[#] numbers numbers_ecdf
==========================
[0] 0 0.125
[1] 1 0.5
[2] 2 0.625
[3] 3 0.875
[4] 4 1.0
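The ECDF value for a given x is simply the fraction of samples less than or equal to x. As an illustrative cross-check in plain Python (not part of the sparktk API):
>>> data = [1, 3, 1, 0, 2, 1, 4, 3]
>>> [float(sum(1 for d in data if d <= x)) / len(data) for x in sorted(set(data))]
[0.125, 0.5, 0.625, 0.875, 1.0]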
def ecdf(self, column):
"""
Builds new frame with columns for data and distribution.
Parameters
----------
:param column: (str) The name of the input column containing the sample data.
:return: (Frame) A new Frame containing each distinct value in the sample and its corresponding ECDF value.
Generates the :term:`empirical cumulative distribution` for the input column.
Examples
--------
Consider the following sample data set in *frame* 'frame' containing several numbers.
>>> frame.inspect()
[#] numbers
============
[0] 1
[1] 3
[2] 1
[3] 0
[4] 2
[5] 1
[6] 4
[7] 3
>>> ecdf_frame = frame.ecdf('numbers')
[===Job Progress===]
>>> ecdf_frame.inspect()
[#] numbers numbers_ecdf
==========================
[0] 0 0.125
[1] 1 0.5
[2] 2 0.625
[3] 3 0.875
[4] 4 1.0
"""
from sparktk.frame.frame import Frame
return Frame(self._tc, self._scala.ecdf(column))
def entropy(
self, data_column, weights_column=None)
Calculate the Shannon entropy of a column.
data_column | (str): | The column whose entropy is to be calculated. |
weights_column | (Optional[str]): | The column that provides weights (frequencies) for the entropy calculation. Must contain numerical data. Default is using uniform weights of 1 for all items. |
Returns | (float): | Entropy. |
The data column is weighted via the weights column. All data elements of weight <= 0 are excluded from the calculation, as are all data elements whose weight is NaN or infinite. If there are no data elements with a finite weight greater than 0, the entropy is zero.
Consider the following sample data set in frame 'frame' containing several numbers.
>>> frame.inspect()
[#] data weight
=================
[0] 0 1
[1] 1 2
[2] 2 4
[3] 4 8
>>> entropy = frame.entropy("data", "weight")
[===Job Progress===]
>>> "%0.8f" % entropy
'1.13691659'
If we have more choices and weights, the computation is not as simple. An on-line search for "Shannon Entropy" will provide more detail.
Given a frame of coin flips, half heads and half tails, the entropy is simply ln(2):
>>> frame.inspect()
[#] data
=========
[0] H
[1] T
[2] H
[3] T
[4] H
[5] T
[6] H
[7] T
[8] H
[9] T
>>> entropy = frame.entropy("data")
[===Job Progress===]
>>> "%0.8f" % entropy
'0.69314718'
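As an illustrative cross-check in plain Python (not part of the sparktk API), the weighted entropy above is -sum(p * ln(p)), where each p is a weight divided by the total weight:
>>> import math
>>> weights = [1.0, 2.0, 4.0, 8.0]
>>> p = [w / sum(weights) for w in weights]
>>> "%0.8f" % -sum(x * math.log(x) for x in p)
'1.13691659'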
def entropy(self, data_column, weights_column=None):
"""
Calculate the Shannon entropy of a column.
Parameters
----------
:param data_column: (str) The column whose entropy is to be calculated.
:param weights_column: (Optional[str]) The column that provides weights (frequencies) for the entropy calculation.
Must contain numerical data. Default is using uniform weights of 1 for all items.
:return: (float) Entropy.
The data column is weighted via the weights column.
All data elements of weight <= 0 are excluded from the calculation, as are
all data elements whose weight is NaN or infinite.
If there are no data elements with a finite weight greater than 0,
the entropy is zero.
Examples
--------
Consider the following sample data set in *frame* 'frame' containing several numbers.
>>> frame.inspect()
[#] data weight
=================
[0] 0 1
[1] 1 2
[2] 2 4
[3] 4 8
>>> entropy = frame.entropy("data", "weight")
[===Job Progress===]
>>> "%0.8f" % entropy
'1.13691659'
If we have more choices and weights, the computation is not as simple.
An on-line search for "Shannon Entropy" will provide more detail.
Given a frame of coin flips, half heads and half tails, the entropy is simply ln(2):
>>> frame.inspect()
[#] data
=========
[0] H
[1] T
[2] H
[3] T
[4] H
[5] T
[6] H
[7] T
[8] H
[9] T
>>> entropy = frame.entropy("data")
[===Job Progress===]
>>> "%0.8f" % entropy
'0.69314718'
"""
return self._scala.entropy(data_column, self._tc.jutils.convert.to_scala_option(weights_column))
def export_to_csv(
self, file_name, separator=',')
Write current frame to disk as a CSV file
file_name | (str): | file destination |
separator | (str): | string to be used for delimiting the fields |
>>> frame = tc.frame.create([[1, 2, 3], [4, 5, 6]])
>>> frame.export_to_csv("sandbox/export_example.csv")
>>> frame2 = tc.frame.import_csv("sandbox/export_example.csv")
>>> frame2.inspect()
[#] C0 C1 C2
===============
[0] 1 2 3
[1] 4 5 6
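The separator parameter can be used to write other delimited formats; for example, a tab-separated file (a sketch; the path is illustrative):
>>> frame.export_to_csv("sandbox/export_example_tabs.csv", separator='\t')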
def export_to_csv(self, file_name, separator=','):
"""
Write current frame to disk as a CSV file
Parameters
----------
:param file_name: (str) file destination
:param separator: (str) string to be used for delimiting the fields
Example
-------
>>> frame = tc.frame.create([[1, 2, 3], [4, 5, 6]])
>>> frame.export_to_csv("sandbox/export_example.csv")
>>> frame2 = tc.frame.import_csv("sandbox/export_example.csv")
>>> frame2.inspect()
[#] C0 C1 C2
===============
[0] 1 2 3
[1] 4 5 6
"""
self._scala.exportToCsv(file_name, separator)
def export_to_hbase(
self, table_name, key_column_name=None, family_name='familyColumn')
Write current frame to HBase table.
Table must exist in HBase.
table_name | (str): | The name of the HBase table that will contain the exported frame |
key_column_name | (Optional[str]): | The name of the column to be used as row key in hbase table |
family_name | (Optional[str]): | The family name of the HBase table that will contain the exported frame |
>>> data = [[1, 0.2, -2, 5], [2, 0.4, -1, 6], [3, 0.6, 0, 7], [4, 0.8, 1, 8]]
>>> schema = [('a', int), ('b', float),('c', int) ,('d', int)]
>>> my_frame = tc.frame.create(data, schema)
>>> my_frame.export_to_hbase("test_demo_hbase", family_name="test_family")
[===Job Progress===]
Verify exported frame in hbase
From bash shell
$hbase shell
hbase(main):001:0> list
You should see test_demo_hbase table.
Run hbase(main):001:0> scan 'test_demo_hbase' (to verify frame).
Output:
ROW COLUMN+CELL
0 column=test_family:a, timestamp=1464219662295, value=1
0 column=test_family:b, timestamp=1464219662295, value=0.2
0 column=test_family:c, timestamp=1464219662295, value=-2
0 column=test_family:d, timestamp=1464219662295, value=5
1 column=test_family:a, timestamp=1464219662295, value=2
1 column=test_family:b, timestamp=1464219662295, value=0.4
1 column=test_family:c, timestamp=1464219662295, value=-1
1 column=test_family:d, timestamp=1464219662295, value=6
2 column=test_family:a, timestamp=1464219662295, value=3
2 column=test_family:b, timestamp=1464219662295, value=0.6
2 column=test_family:c, timestamp=1464219662295, value=0
2 column=test_family:d, timestamp=1464219662295, value=7
3 column=test_family:a, timestamp=1464219662295, value=4
3 column=test_family:b, timestamp=1464219662295, value=0.8
3 column=test_family:c, timestamp=1464219662295, value=1
3 column=test_family:d, timestamp=1464219662295, value=8
4 row(s) in 0.1560 seconds
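To use the values of an existing column as the HBase row key instead of the row index, pass key_column_name (an illustrative sketch; the table name is hypothetical):
>>> my_frame.export_to_hbase("test_demo_hbase_keyed", key_column_name="a", family_name="test_family")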
def export_to_hbase(self, table_name, key_column_name=None, family_name="familyColumn"):
"""
Write current frame to HBase table.
Table must exist in HBase.
Parameters
----------
:param table_name: (str) The name of the HBase table that will contain the exported frame
:param key_column_name: (Optional[str]) The name of the column to be used as row key in hbase table
:param family_name: (Optional[str]) The family name of the HBase table that will contain the exported frame
Example
-------
>>> data = [[1, 0.2, -2, 5], [2, 0.4, -1, 6], [3, 0.6, 0, 7], [4, 0.8, 1, 8]]
>>> schema = [('a', int), ('b', float),('c', int) ,('d', int)]
>>> my_frame = tc.frame.create(data, schema)
>>> my_frame.export_to_hbase("test_demo_hbase", family_name="test_family")
[===Job Progress===]
Verify exported frame in hbase
From bash shell
$hbase shell
hbase(main):001:0> list
You should see test_demo_hbase table.
Run hbase(main):001:0> scan 'test_demo_hbase' (to verify frame).
Output:
ROW COLUMN+CELL
0 column=test_family:a, timestamp=1464219662295, value=1
0 column=test_family:b, timestamp=1464219662295, value=0.2
0 column=test_family:c, timestamp=1464219662295, value=-2
0 column=test_family:d, timestamp=1464219662295, value=5
1 column=test_family:a, timestamp=1464219662295, value=2
1 column=test_family:b, timestamp=1464219662295, value=0.4
1 column=test_family:c, timestamp=1464219662295, value=-1
1 column=test_family:d, timestamp=1464219662295, value=6
2 column=test_family:a, timestamp=1464219662295, value=3
2 column=test_family:b, timestamp=1464219662295, value=0.6
2 column=test_family:c, timestamp=1464219662295, value=0
2 column=test_family:d, timestamp=1464219662295, value=7
3 column=test_family:a, timestamp=1464219662295, value=4
3 column=test_family:b, timestamp=1464219662295, value=0.8
3 column=test_family:c, timestamp=1464219662295, value=1
3 column=test_family:d, timestamp=1464219662295, value=8
4 row(s) in 0.1560 seconds
"""
if not isinstance(table_name, basestring):
raise ValueError("Unsupported 'table_name' parameter type. Expected string, but found %s." % type(table_name))
if not isinstance(family_name, basestring):
raise ValueError(
"Unsupported 'family_name' parameter type. Expected string, but found %s." % type(family_name))
self._scala.exportToHbase(table_name, self._tc.jutils.convert.to_scala_option(key_column_name), family_name)
def export_to_hive(
self, hive_table_name)
Write current frame to Hive table.
Table must not exist in Hive. Hive does not support case-sensitive table names and column names. Hence column names with uppercase letters will be converted to lower case by Hive.
hive_table_name | (str): | hive table name |
>>> data = [[1, 0.2, -2, 5], [2, 0.4, -1, 6], [3, 0.6, 0, 7], [4, 0.8, 1, 8]]
>>> schema = [('a', int), ('b', float),('c', int) ,('d', int)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
hive_table_name: (string): hive table name. A new table with the given name will be created; it must not already exist.
>>> my_frame.export_to_hive("demo_test_hive")
[===Job Progress===]
Verify exported frame in hive
From bash shell
$hive
hive> show tables
You should see demo_test_hive table.
Run hive> select * from demo_test_hive; (to verify frame).
def export_to_hive(self, hive_table_name):
"""
Write current frame to Hive table.
Table must not exist in Hive. Hive does not support case-sensitive table names and column names.
Hence column names with uppercase letters will be converted to lower case by Hive.
Parameters
----------
:param hive_table_name: (str) hive table name
Example
--------
>>> data = [[1, 0.2, -2, 5], [2, 0.4, -1, 6], [3, 0.6, 0, 7], [4, 0.8, 1, 8]]
>>> schema = [('a', int), ('b', float),('c', int) ,('d', int)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
hive_table_name: (string): hive table name. A new table with the given name will be created; it must not already exist.
>>> my_frame.export_to_hive("demo_test_hive")
[===Job Progress===]
Verify exported frame in hive
From bash shell
$hive
hive> show tables
You should see demo_test_hive table.
Run hive> select * from demo_test_hive; (to verify frame).
"""
self._scala.exportToHive(hive_table_name)
def export_to_jdbc(
self, connection_url, table_name)
Write current frame to JDBC table
connection_url | (str): | JDBC connection url to database server |
table_name | (str): | JDBC table name |
>>> from sparktk import TkContext
>>> c=TkContext(sc)
>>> data = [[1, 0.2, -2, 5], [2, 0.4, -1, 6], [3, 0.6, 0, 7], [4, 0.8, 1, 8]]
>>> schema = [('a', int), ('b', float),('c', int) ,('d', int)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
connection_url : (string) : "jdbc:{database_type}://{host}/{database_name}"
Sample connection string for postgres, e.g.: jdbc:postgresql://localhost/postgres [standard connection string to connect to default 'postgres' database]
table_name: (string): table name. A new table with the given name will be created if it does not already exist.
>>> my_frame.export_to_jdbc("jdbc:postgresql://localhost/postgres", "demo_test")
[===Job Progress===]
Verify exported frame in postgres
From bash shell
$ sudo -su postgres psql
postgres=# \d
You should see demo_test table.
Run postgres=# select * from demo_test; (to verify frame).
def export_to_jdbc(self, connection_url, table_name):
"""
Write current frame to JDBC table
Parameters
----------
:param connection_url: (str) JDBC connection url to database server
:param table_name: (str) JDBC table name
Example
-------
>>> from sparktk import TkContext
>>> c=TkContext(sc)
>>> data = [[1, 0.2, -2, 5], [2, 0.4, -1, 6], [3, 0.6, 0, 7], [4, 0.8, 1, 8]]
>>> schema = [('a', int), ('b', float),('c', int) ,('d', int)]
>>> my_frame = tc.frame.create(data, schema)
[===Job Progress===]
connection_url : (string) : "jdbc:{database_type}://{host}/{database_name}"
Sample connection string for postgres,
e.g.: jdbc:postgresql://localhost/postgres [standard connection string to connect to default 'postgres' database]
table_name: (string): table name. A new table with the given name will be created if it does not already exist.
>>> my_frame.export_to_jdbc("jdbc:postgresql://localhost/postgres", "demo_test")
[===Job Progress===]
Verify exported frame in postgres
From bash shell
$ sudo -su postgres psql
postgres=# \d
You should see demo_test table.
Run postgres=# select * from demo_test; (to verify frame).
"""
self._scala.exportToJdbc(connection_url, table_name)
def export_to_json(
self, path, count=0, offset=0)
Write current frame to HDFS in Json format.
path | (str): | The HDFS folder path where the files will be created. |
count | (Optional[int]): | The number of records you want. Default (0), or a non-positive value, is the whole frame. |
offset | (Optional[int]): | The number of rows to skip before exporting to the file. Default is zero (0). |
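As a minimal usage sketch (the HDFS path is illustrative), export 100 rows after skipping the first 10:
>>> frame.export_to_json("sandbox/export_json_example", count=100, offset=10)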
def export_to_json(self, path, count=0, offset=0):
"""
Write current frame to HDFS in Json format.
Parameters
----------
:param path: (str) The HDFS folder path where the files will be created.
:param count: (Optional[int]) The number of records you want. Default (0), or a non-positive value, is the
whole frame.
:param offset: (Optional[int]) The number of rows to skip before exporting to the file. Default is zero (0).
"""
self._scala.exportToJson(path, count, offset)
def filter(
self, predicate)
Select all rows which satisfy a predicate.
Modifies the current frame to save defined rows and delete everything else.
predicate | (UDF): | Function which evaluates a row to a boolean; rows that answer False are dropped from the frame. |
>>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
... ['Susan',33,3,'555-0202'],
... ['Thurston',65,26,'555-4510'],
... ['Judy',44,14,'555-2183']],
... schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])
>>> frame.inspect()
[#] name age tenure phone
====================================
[0] Fred 39 16 555-1234
[1] Susan 33 3 555-0202
[2] Thurston 65 26 555-4510
[3] Judy 44 14 555-2183
>>> frame.filter(lambda row: row.tenure >= 15) # keep only people with 15 or more years tenure
>>> frame.inspect()
[#] name age tenure phone
====================================
[0] Fred 39 16 555-1234
[1] Thurston 65 26 555-4510
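A predicate may reference several columns at once; as an illustrative sketch (not part of the original example set):
>>> frame.filter(lambda row: row.age < 60 and row.tenure >= 15)  # would keep only Fred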
More information on a |UDF| can be found at :doc:`/ds_apir`.
def filter(self, predicate):
"""
Select all rows which satisfy a predicate.
Modifies the current frame to save defined rows and delete everything
else.
Parameters
----------
:param predicate: (UDF) Function which evaluates a row to a boolean; rows that answer False are dropped
from the frame.
Examples
--------
>>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
... ['Susan',33,3,'555-0202'],
... ['Thurston',65,26,'555-4510'],
... ['Judy',44,14,'555-2183']],
... schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])
>>> frame.inspect()
[#] name age tenure phone
====================================
[0] Fred 39 16 555-1234
[1] Susan 33 3 555-0202
[2] Thurston 65 26 555-4510
[3] Judy 44 14 555-2183
>>> frame.filter(lambda row: row.tenure >= 15) # keep only people with 15 or more years tenure
>>> frame.inspect()
[#] name age tenure phone
====================================
[0] Fred 39 16 555-1234
[1] Thurston 65 26 555-4510
More information on a |UDF| can be found at :doc:`/ds_apir`.
"""
row = Row(self.schema)
def filter_func(r):
row._set_data(r)
return predicate(row)
self._python.rdd = self._python.rdd.filter(filter_func)
def flatten_columns(
self, columns)
Spread data to multiple rows based on cell data.
columns | (str or tuple(str, str)): | The name of the column to be flattened, or a tuple with the column name and delimiter string. The default delimiter is a comma (,). |
Splits cells in the specified columns into multiple rows according to a string delimiter. New rows are a full copy of the original row, but the specified columns only contain one value. The original row is deleted.
Given a data file:
1-solo,mono,single-green,yellow,red
2-duo,double-orange,black
The commands to bring the data into a frame, where it can be worked on:
>>> frame.inspect()
[#] a b c
==========================================
[0] 1 solo,mono,single green|yellow|red
[1] 2 duo,double orange|black
Now, spread out those sub-strings in columns b and c by specifying the column names and delimiters:
>>> frame.flatten_columns([('b', ','), ('c', '|')])
[===Job Progress===]
Note that the delimiter is optional, and if no delimiter is specified, the default is a comma (,). So, in the above example, the delimiter for b could be omitted.
Check again:
>>> frame.inspect()
[#] a b c
======================
[0] 1 solo green
[1] 1 mono yellow
[2] 1 single red
[3] 2 duo orange
[4] 2 double black
Alternatively, we can flatten a single column b using the default comma delimiter:
>>> frame.flatten_columns('b')
[===Job Progress===]
Check again:
>>> frame.inspect()
[#] a b c
================================
[0] 1 solo green|yellow|red
[1] 1 mono green|yellow|red
[2] 1 single green|yellow|red
[3] 2 duo orange|black
[4] 2 double orange|black
def flatten_columns(self, columns):
"""
Spread data to multiple rows based on cell data.
Parameters
----------
:param columns: (str or tuple(str, str)) The name of the column to be flattened, or a tuple with the column name and
delimiter string. The default delimiter is a comma (,).
Splits cells in the specified columns into multiple rows according to a string delimiter.
New rows are a full copy of the original row, but the specified columns only contain one value.
The original row is deleted.
Examples
--------
Given a data file:
1-solo,mono,single-green,yellow,red
2-duo,double-orange,black
The commands to bring the data into a frame, where it can be worked on:
>>> frame.inspect()
[#] a b c
==========================================
[0] 1 solo,mono,single green|yellow|red
[1] 2 duo,double orange|black
Now, spread out those sub-strings in columns *b* and *c* by specifying the column names and delimiters:
>>> frame.flatten_columns([('b', ','), ('c', '|')])
[===Job Progress===]
Note that the delimiter is optional, and if no delimiter is specified, the default
is a comma (,). So, in the above example, the delimiter for *b* could be omitted.
Check again:
>>> frame.inspect()
[#] a b c
======================
[0] 1 solo green
[1] 1 mono yellow
[2] 1 single red
[3] 2 duo orange
[4] 2 double black
Alternatively, we can flatten a single column *b* using the default comma delimiter:
>>> frame.flatten_columns('b')
[===Job Progress===]
Check again:
>>> frame.inspect()
[#] a b c
================================
[0] 1 solo green|yellow|red
[1] 1 mono green|yellow|red
[2] 1 single green|yellow|red
[3] 2 duo orange|black
[4] 2 double orange|black
"""
if not isinstance(columns, list):
columns = [columns]
columns = [c if isinstance(c, tuple) else (c, None) for c in columns]
return self._scala.flattenColumns(self._tc.jutils.convert.to_scala_list_string_option_tuple(columns))
def group_by(
self, group_by_columns, *aggregations)
Create a summarized frame with aggregations (Avg, Count, Max, Min, Mean, Sum, Stdev, ...).
group_by_columns | (List[str]): | list of columns to group on |
aggregations | (dict): | Aggregation function based on entire row, and/or dictionaries (one or more) of { column name str : aggregation function(s) }. |
Returns | (Frame): | Summarized Frame |
Creates a new frame and returns a Frame object to access it. Takes a column or group of columns, finds the unique combination of values, and creates unique rows with these column values. The other columns are combined according to the aggregation argument(s).
Aggregation currently supports using the following functions:
* avg
* count
* count_distinct
* max
* min
* stdev
* sum
* var
* histogram()
- Column order is not guaranteed when columns are added.
- The column names created by aggregation functions in the new frame are the original column name appended with the '_' character and the aggregation function. For example, if the original field is a and the function is avg, the resultant column is named a_avg.
- An aggregation argument of count results in a column named count.
- The aggregation function agg.count is the only full row aggregation function supported at this time.
Consider this frame:
>>> frame.inspect()
[#] a b c d e f g
========================================
[0] 1 alpha 3.0 small 1 3.0 9
[1] 1 bravo 5.0 medium 1 4.0 9
[2] 1 alpha 5.0 large 1 8.0 8
[3] 2 bravo 8.0 large 1 5.0 7
[4] 2 charlie 12.0 medium 1 6.0 6
[5] 2 bravo 7.0 small 1 8.0 5
[6] 2 bravo 12.0 large 1 6.0 4
Count the groups in column 'b'
>>> b_count = frame.group_by('b', tc.agg.count)
[===Job Progress===]
>>> b_count.inspect()
[#] b count
===================
[0] alpha 2
[1] charlie 1
[2] bravo 4
Group by columns 'a' and 'b' and compute the average for column 'c'
>>> avg1 = frame.group_by(['a', 'b'], {'c' : tc.agg.avg})
>>> avg1.inspect()
[#] a b c_AVG
======================
[0] 2 charlie 12.0
[1] 2 bravo 9.0
[2] 1 bravo 5.0
[3] 1 alpha 4.0
Group by column 'a' and make a bunch of calculations for the grouped columns 'f' and 'g'
>>> mix_frame = frame.group_by('a', tc.agg.count, {'f': [tc.agg.avg, tc.agg.sum, tc.agg.min], 'g': tc.agg.max})
>>> mix_frame.inspect()
[#] a count g_MAX f_AVG f_SUM f_MIN
=========================================
[0] 2 4 7 6.25 25.0 5.0
[1] 1 3 9 5.0 15.0 3.0
Group by with histogram. The histogram aggregation argument is configured with these parameters:
cutoffs | (List[int or float or long or double]): | An array of values containing bin cutoff points. Array can be list or tuple. If an array is provided, values must be progressively increasing. All bin boundaries must be included, so, with N bins, you need N+1 values. For example, |
cutoffs=[1, 5, 8, 12] # creates three bins:
# bin0 holds values [1 inclusive - 5 exclusive]
# bin1 holds values [5 inclusive - 8 exclusive]
# bin2 holds values [8 inclusive - 12 exclusive]
include_lowest | (Optional[bool]): | Specify how the boundary conditions are handled. ``True`` indicates that the lower bound of the bin is inclusive. ``False`` indicates that the upper bound is inclusive. Default is ``True``. |
strict_binning | (Optional(bool)): | Specify how values outside of the cutoffs array should be binned. If set to ``True``, each value less than cutoffs[0] or greater than cutoffs[-1] will be assigned a bin value of -1. If set to ``False``, values less than cutoffs[0] will be included in the first bin while values greater than cutoffs[-1] will be included in the final bin. |
>>> hist = frame.group_by('a', {'g': tc.agg.histogram([1, 5, 8, 9])})
>>> hist.inspect()
[#] a g_HISTOGRAM
=========================
[0] 2 [0.25, 0.75, 0.0]
[1] 1 [0.0, 0.0, 1.0]
>>> hist = frame.group_by('a', {'g': tc.agg.histogram([1, 5, 8, 9], False)})
>>> hist.inspect()
[#] a g_HISTOGRAM
=============================================
[0] 2 [0.5, 0.5, 0.0]
[1] 1 [0.0, 0.333333333333, 0.666666666667]
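Several aggregation functions can also be requested for the same column by listing them, as in the mixed example above; a sketch using functions from the supported list:
>>> stats = frame.group_by('b', {'c': [tc.agg.min, tc.agg.max, tc.agg.stdev]})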
def group_by(self, group_by_columns, *aggregations):
"""
Create a summarized frame with aggregations (Avg, Count, Max, Min, Mean, Sum, Stdev, ...).
Parameters
----------
:param group_by_columns: (List[str]) list of columns to group on
:param aggregations: (dict) Aggregation function based on entire row, and/or dictionaries (one or more) of { column name str : aggregation function(s) }.
:return: (Frame) Summarized Frame
Creates a new frame and returns a Frame object to access it. Takes a column or group of columns, finds the unique combination of
values, and creates unique rows with these column values. The other columns are combined according to the aggregation argument(s).
Aggregation currently supports using the following functions:
* avg
* count
* count_distinct
* max
* min
* stdev
* sum
* var
* histogram()
Notes
-----
* Column order is not guaranteed when columns are added
* The column names created by aggregation functions in the new frame are the original column name appended
with the '_' character and the aggregation function. For example, if the original field is *a* and the
function is *avg*, the resultant column is named *a_avg*.
* An aggregation argument of *count* results in a column named *count*.
* The aggregation function *agg.count* is the only full row aggregation function supported at this time.
Examples
-------
Consider this frame:
>>> frame.inspect()
[#] a b c d e f g
========================================
[0] 1 alpha 3.0 small 1 3.0 9
[1] 1 bravo 5.0 medium 1 4.0 9
[2] 1 alpha 5.0 large 1 8.0 8
[3] 2 bravo 8.0 large 1 5.0 7
[4] 2 charlie 12.0 medium 1 6.0 6
[5] 2 bravo 7.0 small 1 8.0 5
[6] 2 bravo 12.0 large 1 6.0 4
Count the groups in column 'b'
>>> b_count = frame.group_by('b', tc.agg.count)
[===Job Progress===]
>>> b_count.inspect()
[#] b count
===================
[0] alpha 2
[1] charlie 1
[2] bravo 4
Group by columns 'a' and 'b' and compute the average for column 'c'
>>> avg1 = frame.group_by(['a', 'b'], {'c' : tc.agg.avg})
>>> avg1.inspect()
[#] a b c_AVG
======================
[0] 2 charlie 12.0
[1] 2 bravo 9.0
[2] 1 bravo 5.0
[3] 1 alpha 4.0
Group by column 'a' and make a bunch of calculations for the grouped columns 'f' and 'g'
>>> mix_frame = frame.group_by('a', tc.agg.count, {'f': [tc.agg.avg, tc.agg.sum, tc.agg.min], 'g': tc.agg.max})
>>> mix_frame.inspect()
[#] a count g_MAX f_AVG f_SUM f_MIN
=========================================
[0] 2 4 7 6.25 25.0 5.0
[1] 1 3 9 5.0 15.0 3.0
**Group by with histogram**. The histogram aggregation argument is configured with these parameters:
:param cutoffs: (List[int or float or long or double]) An array of values containing bin cutoff points.
Array can be list or tuple. If an array is provided, values must be progressively increasing. All bin
boundaries must be included, so, with N bins, you need N+1 values. For example,
cutoffs=[1, 5, 8, 12] # creates three bins:
# bin0 holds values [1 inclusive - 5 exclusive]
# bin1 holds values [5 inclusive - 8 exclusive]
# bin2 holds values [8 inclusive - 12 exclusive]
:param include_lowest: (Optional[bool]) Specify how the boundary conditions are handled. ``True``
indicates that the lower bound of the bin is inclusive. ``False`` indicates that the upper bound is
inclusive. Default is ``True``.
:param strict_binning: (Optional(bool)) Specify how values outside of the cutoffs array should be
binned. If set to ``True``, each value less than cutoffs[0] or greater than cutoffs[-1] will be
assigned a bin value of -1. If set to ``False``, values less than cutoffs[0] will be included in
the first bin while values greater than cutoffs[-1] will be included in the final bin.
Example
-------
>>> hist = frame.group_by('a', {'g': tc.agg.histogram([1, 5, 8, 9])})
>>> hist.inspect()
[#] a g_HISTOGRAM
=========================
[0] 2 [0.25, 0.75, 0.0]
[1] 1 [0.0, 0.0, 1.0]
>>> hist = frame.group_by('a', {'g': tc.agg.histogram([1, 5, 8, 9], False)})
>>> hist.inspect()
[#] a g_HISTOGRAM
=============================================
[0] 2 [0.5, 0.5, 0.0]
[1] 1 [0.0, 0.333333333333, 0.666666666667]
"""
if group_by_columns is None:
group_by_columns = []
elif isinstance(group_by_columns, basestring):
group_by_columns = [group_by_columns]
first_column_name = None
aggregation_list = [] # aggregationFunction : String, columnName : String, newColumnName
for arg in aggregations:
if arg == agg.count:
if not first_column_name:
# only make this call once, since it goes to http - TODO, ultimately should be handled server-side
first_column_name = self.column_names[0]
aggregation_list.append(
{'function': agg.count, 'column_name': first_column_name, 'new_column_name': "count"})
elif isinstance(arg, dict):
for key, value in arg.iteritems():
# leave the valid column check to the server
if isinstance(value, list) or isinstance(value, tuple):
for item in value:
if item not in agg:
raise ValueError(
"%s is not a valid aggregation function, like agg.max. Supported agg methods: %s" % (
item, agg))
aggregation_list.append(
{'function': item, 'column_name': key, 'new_column_name': "%s_%s" % (key, item)})
else:
aggregation_list.append(
{'function': value, 'column_name': key, 'new_column_name': "%s_%s" % (key, value)})
else:
raise TypeError(
"Bad type %s provided in aggregation arguments; expecting an aggregation function or a dictionary of column_name:[func]" % type(
arg))
scala_group_by_aggregation_args = []
for item in aggregation_list:
scala_group_by_aggregation_args.append(self._tc.jutils.convert.to_scala_group_by_aggregation_args(item))
from sparktk.frame.frame import Frame
return Frame(self._tc, self._scala.groupBy(self._tc.jutils.convert.to_scala_list_string(group_by_columns),
self._tc.jutils.convert.to_scala_list(scala_group_by_aggregation_args)))
def histogram(
self, column_name, num_bins=None, weight_column_name=None, bin_type='equalwidth')
Compute the histogram for a column in a frame.
The returned value is a Histogram object containing 3 lists one each for: the cutoff points of the bins, size of each bin, and density of each bin.
column_name | (str): | Name of column to be evaluated. |
num_bins | (Optional[int]): | Number of bins in histogram. Default is the Square-root choice, i.e. math.floor(math.sqrt(frame.count())). |
weight_column_name | (Optional[str]): | Name of column containing weights. Default is all observations are weighted equally. |
bin_type | (str["equalwidth"|"equaldepth"]): | The type of binning algorithm to use: ["equalwidth"|"equaldepth"] Default is "equalwidth". |
Returns | (Histogram): | A Histogram object containing the result set. The data returned is composed of multiple components: |
cutoffs : array of float
A list containing the edges of each bin.
hist : array of float
A list containing the count of the weighted observations found in each bin.
density : array of float
A list of decimals giving the percentage of observations found in each bin relative to the total set.
The num_bins parameter is considered to be the maximum permissible number of bins because the data may dictate fewer bins. With equal depth binning, for example, if the column to be binned has 10 elements with only 2 distinct values and the num_bins parameter is greater than 2, then the actual number of bins will only be 2. This is due to a restriction that elements with an identical value must belong to the same bin.
Consider the following sample data set:
>>> frame.inspect()
[#] a b
=========
[0] a 2
[1] b 7
[2] c 3
[3] d 9
[4] e 1
A simple call for 3 equal-width bins gives:
>>> hist = frame.histogram("b", num_bins=3)
>>> hist.cutoffs
[1.0, 3.6666666666666665, 6.333333333333333, 9.0]
>>> hist.hist
[3.0, 0.0, 2.0]
>>> hist.density
[0.6, 0.0, 0.4]
Switching to equal depth gives:
>>> hist = frame.histogram("b", num_bins=3, bin_type='equaldepth')
>>> hist.cutoffs
[1.0, 2.0, 7.0, 9.0]
>>> hist.hist
[1.0, 2.0, 2.0]
>>> hist.density
[0.2, 0.4, 0.4]
Plot hist as a bar chart using matplotlib:
>>> import matplotlib.pyplot as plt
>>> plt.bar(hist.cutoffs[:-1], hist.hist, width=hist.cutoffs[1] - hist.cutoffs[0])
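As a cross-check in plain Python (not part of the sparktk API), each density value is the bin count divided by the total count of weighted observations; for the equal-depth histogram above:
>>> [h / sum(hist.hist) for h in hist.hist]
[0.2, 0.4, 0.4]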
def histogram(self, column_name, num_bins=None, weight_column_name=None, bin_type="equalwidth"):
"""
Compute the histogram for a column in a frame.
The returned value is a Histogram object containing 3 lists one each for:
the cutoff points of the bins, size of each bin, and density of each bin.
Parameters
----------
:param column_name: (str) Name of column to be evaluated.
:param num_bins: (Optional[int]) Number of bins in histogram.
Default is the Square-root choice, i.e. math.floor(math.sqrt(frame.count())).
:param weight_column_name: (Optional[str]) Name of column containing weights.
Default is all observations are weighted equally.
:param bin_type: (str["equalwidth"|"equaldepth"]) The type of binning algorithm to use:
["equalwidth"|"equaldepth"] Default is "equalwidth".
:return: (Histogram) A Histogram object containing the result set.
The data returned is composed of multiple components:
cutoffs : array of float
A list containing the edges of each bin.
hist : array of float
A list containing count of the weighted observations found in each bin.
density : array of float
A list of decimals giving the percentage of
observations found in each bin relative to the total set.
Notes
-----
The num_bins parameter is considered to be the maximum permissible number
of bins because the data may dictate fewer bins.
With equal depth binning, for example, if the column to be binned has 10
elements with only 2 distinct values and the *num_bins* parameter is
greater than 2, then the actual number of bins will only be 2.
This is due to a restriction that elements with an identical value must
belong to the same bin.
Examples
--------
Consider the following sample data set:
>>> frame.inspect()
[#] a b
=========
[0] a 2
[1] b 7
[2] c 3
[3] d 9
[4] e 1
A simple call for 3 equal-width bins gives:
>>> hist = frame.histogram("b", num_bins=3)
>>> hist.cutoffs
[1.0, 3.6666666666666665, 6.333333333333333, 9.0]
>>> hist.hist
[3.0, 0.0, 2.0]
>>> hist.density
[0.6, 0.0, 0.4]
Switching to equal depth gives:
>>> hist = frame.histogram("b", num_bins=3, bin_type='equaldepth')
>>> hist.cutoffs
[1.0, 2.0, 7.0, 9.0]
>>> hist.hist
[1.0, 2.0, 2.0]
>>> hist.density
[0.2, 0.4, 0.4]
Plot hist as a bar chart using matplotlib:
>>> import matplotlib.pyplot as plt
>>> plt.bar(hist.cutoffs[:-1], hist.hist, width=hist.cutoffs[1] - hist.cutoffs[0])
"""
results = self._tc.jutils.convert.scala_map_to_python_with_iterable_values(self._scala.histogram(column_name,
self._tc.jutils.convert.to_scala_option(num_bins),
self._tc.jutils.convert.to_scala_option(weight_column_name),
bin_type))
return Histogram(**results)
def inspect(
self, n=10, offset=0, columns=None, wrap='inspect_settings', truncate='inspect_settings', round='inspect_settings', width='inspect_settings', margin='inspect_settings', with_types='inspect_settings')
Pretty-print of the frame data
Essentially returns a string, but technically returns a RowsInspection object which renders a string.
The RowsInspection object naturally converts to a str when needed, like when printed or when displayed
by the python REPL (i.e. using the object's repr). If running in a script and you want the inspect output
to be printed, it must be explicitly printed, e.g. print frame.inspect()
n | (Optional[int]): | The number of rows to print |
offset | (Optional[int]): | The number of rows to skip before printing. |
columns | (Optional[List[str]]): | Filter columns to be included. By default, all columns are included. |
wrap | (Optional[int or 'stripes']): | If set to 'stripes' then inspect prints rows in stripes; if set to an integer N, rows will be printed in clumps of N columns, where the columns are wrapped. |
truncate | (Optional[int]): | If set to integer N, all strings will be truncated to length N, including all tagged ellipses. |
round | (Optional[int]): | If set to integer N, all floating point numbers will be rounded and truncated to N digits. |
width | (Optional[int]): | If set to integer N, the print out will try to honor a max line width of N. |
margin | (Optional[int]): | Applies to 'stripes' mode only. If set to integer N, the margin for printing names in a stripe will be limited to N characters. |
with_types | (Optional[bool]): | If set to True, header will include the data_type of each column. |
Returns | (RowsInspection): | An object which naturally converts to a pretty-print string. |
To look at the first 4 rows of data in a frame:
>>> frame.inspect(4)
[#] animal name age weight
==================================
[0] human George 8 542.5
[1] human Ursula 6 495.0
[2] ape Ape 41 400.0
[3] elephant Shep 5 8630.0
For other examples, see :ref:`example_frame.inspect`.
Note: if the frame data contains unicode characters, this method may raise a Unicode exception when running in an interactive REPL (or anything else that triggers the standard python repr()). To get around this problem, explicitly print the unicode of the returned object:
>>> print unicode(frame.inspect())
Global Settings
If not specified, the arguments that control formatting receive default values from 'sparktk.inspect_settings'. Make changes there to affect all calls to inspect.
>>> import sparktk
>>> sparktk.inspect_settings
wrap 20
truncate None
round None
width 80
margin None
with_types False
>>> sparktk.inspect_settings.width = 120 # changes inspect to use 120 width globally
>>> sparktk.inspect_settings.truncate = 16 # changes inspect to always truncate strings to 16 chars
>>> sparktk.inspect_settings
wrap 20
truncate 16
round None
width 120
margin None
with_types False
>>> sparktk.inspect_settings.width = None # return value back to default
>>> sparktk.inspect_settings
wrap 20
truncate 16
round None
width 80
margin None
with_types False
>>> sparktk.inspect_settings.reset() # set everything back to default
>>> sparktk.inspect_settings
wrap 20
truncate None
round None
width 80
margin None
with_types False
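Formatting can also be overridden per call instead of globally; for example, a sketch restricting the output to two columns and rounding floats to one digit:
>>> frame.inspect(4, columns=['animal', 'weight'], round=1)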
def inspect(self,
n=10,
offset=0,
columns=None,
wrap=inspect_settings._unspecified,
truncate=inspect_settings._unspecified,
round=inspect_settings._unspecified,
width=inspect_settings._unspecified,
margin=inspect_settings._unspecified,
with_types=inspect_settings._unspecified):
"""
Pretty-print of the frame data
Essentially returns a string, but technically returns a RowsInspection object which renders a string.
The RowsInspection object naturally converts to a str when needed, like when printed or when displayed
by the python REPL (i.e. using the object's __repr__). If running in a script and you want the inspect output
to be printed, it must be explicitly printed, e.g. `print frame.inspect()`
Parameters
----------
:param n: (Optional[int]) The number of rows to print
:param offset: (Optional[int]) The number of rows to skip before printing.
:param columns: (Optional[List[str]]) Filter columns to be included. By default, all columns are included.
:param wrap: (Optional[int or 'stripes']) If set to 'stripes' then inspect prints rows in stripes; if set to an
integer N, rows will be printed in clumps of N columns, where the columns are wrapped.
:param truncate: (Optional[int]) If set to integer N, all strings will be truncated to length N, including all
tagged ellipses.
:param round: (Optional[int]) If set to integer N, all floating point numbers will be rounded and truncated to
N digits.
:param width: (Optional[int]) If set to integer N, the print out will try to honor a max line width of N.
:param margin: (Optional[int]) Applies to 'stripes' mode only. If set to integer N, the margin for printing names
in a stripe will be limited to N characters.
:param with_types: (Optional[bool]) If set to True, header will include the data_type of each column.
:return: (RowsInspection) An object which naturally converts to a pretty-print string.
Examples
--------
To look at the first 4 rows of data in a frame:
>>> frame.inspect(4)
[#] animal name age weight
==================================
[0] human George 8 542.5
[1] human Ursula 6 495.0
[2] ape Ape 41 400.0
[3] elephant Shep 5 8630.0
For other examples, see :ref:`example_frame.inspect`.
Note: if the frame data contains unicode characters, this method may raise a Unicode exception when
running in an interactive REPL (or anything else that triggers the standard python repr()). To get around
this problem, explicitly print the unicode of the returned object:
>>> print unicode(frame.inspect())
**Global Settings**
If not specified, the arguments that control formatting receive default values from
'sparktk.inspect_settings'. Make changes there to affect all calls to inspect.
>>> import sparktk
>>> sparktk.inspect_settings
wrap 20
truncate None
round None
width 80
margin None
with_types False
>>> sparktk.inspect_settings.width = 120 # changes inspect to use 120 width globally
>>> sparktk.inspect_settings.truncate = 16 # changes inspect to always truncate strings to 16 chars
>>> sparktk.inspect_settings
wrap 20
truncate 16
round None
width 120
margin None
with_types False
>>> sparktk.inspect_settings.width = None # return value back to default
>>> sparktk.inspect_settings
wrap 20
truncate 16
round None
width 80
margin None
with_types False
>>> sparktk.inspect_settings.reset() # set everything back to default
>>> sparktk.inspect_settings
wrap 20
truncate None
round None
width 80
margin None
with_types False
"""
from sparktk.frame.ops.take import take_rich
format_settings = inspect_settings.copy(wrap, truncate, round, width, margin, with_types)
result = take_rich(self, n, offset, columns)
return RowsInspection(result.data, result.schema, offset=offset, format_settings=format_settings)
def join_inner(
self, right, left_on, right_on=None, use_broadcast=None)
join_inner performs an inner join operation on two frames (or a frame with itself), creating a new frame.
right | (Frame): | Another frame to join with |
left_on | (List[str]): | Names of the columns in the left frame used to match up the two frames. |
right_on | (Optional[List[str]]): | Names of the columns in the right frame used to match up the two frames. Default is the same as the left frame. |
use_broadcast | (Optional[str]): | If one of your tables is small enough to fit in the memory of a single machine, you can use a broadcast join. Specify that table to broadcast (left or right) to possibly improve performance. Default is None. |
Returns | (Frame): | A new frame with the results of the join |
Create a new frame from a SQL JOIN operation with another frame. The frame on the 'left' is the currently active frame. The frame on the 'right' is another frame. This method takes column(s) in the left frame and matches their values with column(s) in the right frame. An 'inner' join will only allow data in the resultant frame if both the left and right frames have the same value in the matching column(s).
When a column is named the same in both frames, it will result in two columns in the new frame. The column from the left frame (originally the current frame) will be copied and the column name will have the string "_L" added to it. The same thing will happen with the column from the right frame, except its name has the string "_R" appended. The order of columns after this method is called is not guaranteed.
It is recommended that you rename the columns to meaningful terms prior to using the join method.
Consider two frames: codes and colors
>>> codes.inspect()
[#] numbers
============
[0] 1
[1] 3
[2] 1
[3] 0
[4] 2
[5] 1
[6] 5
[7] 3
>>> colors.inspect()
[#] numbers color
====================
[0] 1 red
[1] 2 yellow
[2] 3 green
[3] 4 blue
Inner join using hash joins.
>>> j = codes.join_inner(colors, 'numbers')
[===Job Progress===]
>>> j.inspect()
[#] numbers color
====================
[0] 1 red
[1] 1 red
[2] 1 red
[3] 2 yellow
[4] 3 green
[5] 3 green
(The join adds an extra column *_R which is the join column from the right frame; it may be disregarded)
Consider two frames: country_codes_frame and country_names_frame
>>> country_codes_frame.inspect()
[#] country_code area_code test_str
======================================
[0] 1 354 a
[1] 2 91 a
[2] 2 100 b
[3] 3 47 a
[4] 4 968 c
[5] 5 50 c
>>> country_names_frame.inspect()
[#] country_code country_name test_str
=========================================
[0] 1 Iceland a
[1] 1 Ice-land a
[2] 2 India b
[3] 3 Norway a
[4] 4 Oman c
[5] 6 Germany c
Join them on the 'country_code' and 'test_str' columns ('inner' join by default)
>>> composite_join = country_codes_frame.join_inner(country_names_frame, ['country_code', 'test_str'])
[===Job Progress===]
>>> composite_join.inspect()
[#] country_code area_code test_str country_name
====================================================
[0] 1 354 a Iceland
[1] 1 354 a Ice-land
[2] 2 100 b India
[3] 3 47 a Norway
[4] 4 968 c Oman
Inner join broadcasting the left table
>>> j = codes.join_inner(colors, 'numbers', use_broadcast="left")
[===Job Progress===]
>>> j.inspect()
[#] numbers color
====================
[0] 1 red
[1] 1 red
[2] 1 red
[3] 2 yellow
[4] 3 green
[5] 3 green
>>> composite_join_left = country_codes_frame.join_inner(country_names_frame, ['country_code', 'test_str'], use_broadcast="left")
[===Job Progress===]
>>> composite_join_left.inspect()
[#] country_code area_code test_str country_name
====================================================
[0] 1 354 a Iceland
[1] 1 354 a Ice-land
[2] 2 100 b India
[3] 3 47 a Norway
[4] 4 968 c Oman
Inner join broadcasting right table
>>> j = codes.join_inner(colors, 'numbers', use_broadcast="right")
[===Job Progress===]
>>> j.inspect()
[#] numbers color
====================
[0] 1 red
[1] 3 green
[2] 1 red
[3] 2 yellow
[4] 1 red
[5] 3 green
>>> composite_join_right = country_codes_frame.join_inner(country_names_frame, ['country_code', 'test_str'], use_broadcast="right")
[===Job Progress===]
>>> composite_join_right.inspect()
[#] country_code area_code test_str country_name
====================================================
[0] 1 354 a Iceland
[1] 1 354 a Ice-land
[2] 2 100 b India
[3] 3 47 a Norway
[4] 4 968 c Oman
def join_inner(self,
right,
left_on,
right_on=None,
use_broadcast=None):
"""
join_inner performs an inner join operation on two frames (or a frame with itself), creating a new frame.
Parameters
----------
:param right: (Frame) Another frame to join with
:param left_on: (List[str]) Names of the columns in the left frame used to match up the two frames.
:param right_on: (Optional[List[str]]) Names of the columns in the right frame used to match up the two frames. Default is the same as the left frame.
:param use_broadcast: (Optional[str]) If one of your tables is small enough to fit in the memory of a single machine, you can use a broadcast join.
Specify that table to broadcast (left or right) to possibly improve performance. Default is None.
:returns: (Frame) A new frame with the results of the join
Create a new frame from a SQL JOIN operation with another frame.
The frame on the 'left' is the currently active frame.
The frame on the 'right' is another frame.
This method takes column(s) in the left frame and matches their values
with column(s) in the right frame.
'inner' join will only allow data in the resultant frame if both the left and right frames have the same value
in the matching column(s).
Notes
-----
When a column is named the same in both frames, it will result in two
columns in the new frame.
The column from the *left* frame (originally the current frame) will be
copied and the column name will have the string "_L" added to it.
The same thing will happen with the column from the *right* frame,
except its name has the string "_R" appended. The order of columns
after this method is called is not guaranteed.
It is recommended that you rename the columns to meaningful terms prior
to using the ``join`` method.
Examples
--------
Consider two frames: codes and colors
>>> codes.inspect()
[#] numbers
============
[0] 1
[1] 3
[2] 1
[3] 0
[4] 2
[5] 1
[6] 5
[7] 3
>>> colors.inspect()
[#] numbers color
====================
[0] 1 red
[1] 2 yellow
[2] 3 green
[3] 4 blue
Inner join using hash joins.
>>> j = codes.join_inner(colors, 'numbers')
[===Job Progress===]
>>> j.inspect()
[#] numbers color
====================
[0] 1 red
[1] 1 red
[2] 1 red
[3] 2 yellow
[4] 3 green
[5] 3 green
(The join adds an extra column *_R which is the join column from the right frame; it may be disregarded)
Consider two frames: country_codes_frame and country_names_frame
>>> country_codes_frame.inspect()
[#] country_code area_code test_str
======================================
[0] 1 354 a
[1] 2 91 a
[2] 2 100 b
[3] 3 47 a
[4] 4 968 c
[5] 5 50 c
>>> country_names_frame.inspect()
[#] country_code country_name test_str
=========================================
[0] 1 Iceland a
[1] 1 Ice-land a
[2] 2 India b
[3] 3 Norway a
[4] 4 Oman c
[5] 6 Germany c
Join them on the 'country_code' and 'test_str' columns ('inner' join by default)
>>> composite_join = country_codes_frame.join_inner(country_names_frame, ['country_code', 'test_str'])
[===Job Progress===]
>>> composite_join.inspect()
[#] country_code area_code test_str country_name
====================================================
[0] 1 354 a Iceland
[1] 1 354 a Ice-land
[2] 2 100 b India
[3] 3 47 a Norway
[4] 4 968 c Oman
Inner join broadcasting the left table
>>> j = codes.join_inner(colors, 'numbers',use_broadcast="left")
[===Job Progress===]
>>> j.inspect()
[#] numbers color
====================
[0] 1 red
[1] 1 red
[2] 1 red
[3] 2 yellow
[4] 3 green
[5] 3 green
>>> composite_join_left = country_codes_frame.join_inner(country_names_frame, ['country_code', 'test_str'],use_broadcast="left")
[===Job Progress===]
>>> composite_join_left.inspect()
[#] country_code area_code test_str country_name
====================================================
[0] 1 354 a Iceland
[1] 1 354 a Ice-land
[2] 2 100 b India
[3] 3 47 a Norway
[4] 4 968 c Oman
Inner join broadcasting right table
>>> j = codes.join_inner(colors, 'numbers',use_broadcast="right")
[===Job Progress===]
>>> j.inspect()
[#] numbers color
====================
[0] 1 red
[1] 3 green
[2] 1 red
[3] 2 yellow
[4] 1 red
[5] 3 green
>>> composite_join_right = country_codes_frame.join_inner(country_names_frame, ['country_code', 'test_str'],use_broadcast="right")
[===Job Progress===]
>>> composite_join_right.inspect()
[#] country_code area_code test_str country_name
====================================================
[0] 1 354 a Iceland
[1] 1 354 a Ice-land
[2] 2 100 b India
[3] 3 47 a Norway
[4] 4 968 c Oman
"""
if left_on is None:
raise ValueError("Please provide column name on which join should be performed")
elif isinstance(left_on, basestring):
left_on = [left_on]
if right_on is None:
right_on = left_on
elif isinstance(right_on, basestring):
right_on = [right_on]
if len(left_on) != len(right_on):
raise ValueError("Please provide equal number of join columns")
from sparktk.frame.frame import Frame
return Frame(self._tc, self._scala.joinInner(right._scala,
self._tc.jutils.convert.to_scala_list_string(left_on),
self._tc.jutils.convert.to_scala_option(
self._tc.jutils.convert.to_scala_list_string(right_on)),
self._tc.jutils.convert.to_scala_option(use_broadcast)))
def join_left(
self, right, left_on, right_on=None, use_broadcast_right=False)
join_left performs a left (outer) join operation on two frames (or a frame with itself), creating a new frame.
right | (Frame): | Another frame to join with |
left_on | (List[str]): | Names of the columns in the left frame used to match up the two frames. |
right_on | (Optional[List[str]]): | Names of the columns in the right frame used to match up the two frames. Default is the same as the left frame. |
use_broadcast_right | (bool): | If right table is small enough to fit in the memory of a single machine, you can set use_broadcast_right to True to possibly improve performance using broadcast join. Default is False. |
Returns | (Frame): | A new frame with the results of the join |
Create a new frame from a SQL JOIN operation with another frame. The frame on the 'left' is the currently active frame. The frame on the 'right' is another frame. This method takes column(s) in the left frame and matches their values with column(s) in the right frame. A 'left' (outer) join keeps every row from the left frame, and includes data from the right frame only where its column(s) have a value which matches the value in the left frame column(s).
When a column is named the same in both frames, it will result in two columns in the new frame. The column from the left frame (originally the current frame) will be copied and the column name will have the string "_L" added to it. The same thing will happen with the column from the right frame, except its name has the string "_R" appended. The order of columns after this method is called is not guaranteed.
It is recommended that you rename the columns to meaningful terms prior to using the join method.
Consider two frames: codes and colors
>>> codes.inspect()
[#] numbers
============
[0] 1
[1] 3
[2] 1
[3] 0
[4] 2
[5] 1
[6] 5
[7] 3
>>> colors.inspect()
[#] numbers color
====================
[0] 1 red
[1] 2 yellow
[2] 3 green
[3] 4 blue
>>> j_left = codes.join_left(colors, 'numbers')
[===Job Progress===]
>>> j_left.inspect()
[#] numbers_L color
======================
[0] 0 None
[1] 1 red
[2] 1 red
[3] 1 red
[4] 2 yellow
[5] 3 green
[6] 3 green
[7] 5 None
(The join adds an extra column *_R which is the join column from the right frame; it may be disregarded)
Consider two frames: country_codes_frame and country_names_frame
>>> country_codes_frame.inspect()
[#] country_code area_code test_str
======================================
[0] 1 354 a
[1] 2 91 a
[2] 2 100 b
[3] 3 47 a
[4] 4 968 c
[5] 5 50 c
>>> country_names_frame.inspect()
[#] country_code country_name test_str
=========================================
[0] 1 Iceland a
[1] 1 Ice-land a
[2] 2 India b
[3] 3 Norway a
[4] 4 Oman c
[5] 6 Germany c
Join them on the 'country_code' and 'test_str' columns ('left' join)
>>> composite_join_left = country_codes_frame.join_left(country_names_frame, ['country_code', 'test_str'])
[===Job Progress===]
>>> composite_join_left.inspect()
[#] country_code_L area_code test_str_L country_name
========================================================
[0] 1 354 a Iceland
[1] 1 354 a Ice-land
[2] 2 91 a None
[3] 2 100 b India
[4] 3 47 a Norway
[5] 4 968 c Oman
[6] 5 50 c None
Left join broadcasting right table
>>> j_left = codes.join_left(colors, 'numbers', use_broadcast_right=True)
[===Job Progress===]
>>> j_left.inspect()
[#] numbers_L color
======================
[0] 1 red
[1] 3 green
[2] 1 red
[3] 0 None
[4] 2 yellow
[5] 1 red
[6] 5 None
[7] 3 green
>>> composite_join_left = country_codes_frame.join_left(country_names_frame, ['country_code', 'test_str'], use_broadcast_right=True)
[===Job Progress===]
>>> composite_join_left.inspect()
[#] country_code_L area_code test_str_L country_name
========================================================
[0] 1 354 a Iceland
[1] 1 354 a Ice-land
[2] 2 91 a None
[3] 2 100 b India
[4] 3 47 a Norway
[5] 4 968 c Oman
[6] 5 50 c None
def join_left(self,
right,
left_on,
right_on=None,
use_broadcast_right=False):
"""
join_left performs a left (left outer) join operation on one or two frames, creating a new frame.
Parameters
---------
:param right: (Frame) Another frame to join with
:param left_on: (List[str]) Names of the columns in the left frame used to match up the two frames.
:param right_on: (Optional[List[str]]) Names of the columns in the right frame used to match up the two frames. Default is the same as the left frame.
:param use_broadcast_right: (bool) If the right table is small enough to fit in the memory of a single machine,
set use_broadcast_right to True to potentially improve performance by using a broadcast join. Default is False.
:returns: (Frame) A new frame with the results of the join
Create a new frame from a SQL JOIN operation with another frame.
The frame on the 'left' is the currently active frame.
The frame on the 'right' is another frame.
This method takes column(s) in the left frame and matches their values
with column(s) in the right frame.
A 'left' join keeps every row from the left frame; data from the right
frame is included only where the value in its column(s) matches the value
in the left frame's column(s).
Notes
-----
When a column is named the same in both frames, it will result in two
columns in the new frame.
The column from the *left* frame (originally the current frame) will be
copied and the column name will have the string "_L" added to it.
The same thing will happen with the column from the *right* frame,
except its name has the string "_R" appended. The order of columns
after this method is called is not guaranteed.
It is recommended that you rename the columns to meaningful terms prior
to using the ``join`` method.
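For instance, a minimal hypothetical sketch (reusing the codes and colors frames from the example below) of renaming the right frame's join column up front so the joined result keeps distinct, meaningful names:
>>> colors.rename_columns({'numbers': 'color_code'})
>>> j = codes.join_left(colors, left_on='numbers', right_on='color_code')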
Examples
--------
Consider two frames: codes and colors
>>> codes.inspect()
[#] numbers
============
[0] 1
[1] 3
[2] 1
[3] 0
[4] 2
[5] 1
[6] 5
[7] 3
>>> colors.inspect()
[#] numbers color
====================
[0] 1 red
[1] 2 yellow
[2] 3 green
[3] 4 blue
>>> j_left = codes.join_left(colors, 'numbers')
[===Job Progress===]
>>> j_left.inspect()
[#] numbers_L color
======================
[0] 0 None
[1] 1 red
[2] 1 red
[3] 1 red
[4] 2 yellow
[5] 3 green
[6] 3 green
[7] 5 None
(The join adds an extra column *_R which is the join column from the right frame; it may be disregarded)
Consider two frames: country_codes_frame and country_names_frame
>>> country_codes_frame.inspect()
[#] country_code area_code test_str
======================================
[0] 1 354 a
[1] 2 91 a
[2] 2 100 b
[3] 3 47 a
[4] 4 968 c
[5] 5 50 c
>>> country_names_frame.inspect()
[#] country_code country_name test_str
=========================================
[0] 1 Iceland a
[1] 1 Ice-land a
[2] 2 India b
[3] 3 Norway a
[4] 4 Oman c
[5] 6 Germany c
Join them on the 'country_code' and 'test_str' columns ('inner' join by default)
>>> composite_join_left = country_codes_frame.join_left(country_names_frame, ['country_code', 'test_str'])
[===Job Progress===]
>>> composite_join_left.inspect()
[#] country_code_L area_code test_str_L country_name
========================================================
[0] 1 354 a Iceland
[1] 1 354 a Ice-land
[2] 2 91 a None
[3] 2 100 b India
[4] 3 47 a Norway
[5] 4 968 c Oman
[6] 5 50 c None
Left join broadcasting right table
>>> j_left = codes.join_left(colors, 'numbers', use_broadcast_right=True)
[===Job Progress===]
>>> j_left.inspect()
[#] numbers_L color
======================
[0] 1 red
[1] 3 green
[2] 1 red
[3] 0 None
[4] 2 yellow
[5] 1 red
[6] 5 None
[7] 3 green
>>> composite_join_left = country_codes_frame.join_left(country_names_frame, ['country_code', 'test_str'], use_broadcast_right=True)
[===Job Progress===]
>>> composite_join_left.inspect()
[#] country_code_L area_code test_str_L country_name
========================================================
[0] 1 354 a Iceland
[1] 1 354 a Ice-land
[2] 2 91 a None
[3] 2 100 b India
[4] 3 47 a Norway
[5] 4 968 c Oman
[6] 5 50 c None
"""
if left_on is None:
raise ValueError("Please provide column name on which join should be performed")
elif isinstance(left_on, basestring):
left_on = [left_on]
if right_on is None:
right_on = left_on
elif isinstance(right_on, basestring):
right_on = [right_on]
if len(left_on) != len(right_on):
raise ValueError("Please provide equal number of join columns")
from sparktk.frame.frame import Frame
return Frame(self._tc, self._scala.joinLeft(right._scala,
self._tc.jutils.convert.to_scala_list_string(left_on),
self._tc.jutils.convert.to_scala_option(
self._tc.jutils.convert.to_scala_list_string(right_on)),
use_broadcast_right))
def join_outer(self,
right,
left_on,
right_on=None):
"""
join_outer performs an outer (full outer) join operation on one or two frames, creating a new frame.
Parameters
----------
:param right: (Frame) Another frame to join with
:param left_on: (List[str]) Names of the columns in the left frame used to match up the two frames.
:param right_on: (Optional[List[str]]) Names of the columns in the right frame used to match up the two frames. Default is the same as the left frame.
:returns: (Frame) A new frame with the results of the join
Create a new frame from a SQL JOIN operation with another frame.
The frame on the 'left' is the currently active frame.
The frame on the 'right' is another frame.
This method takes column(s) in the left frame and matches their values
with column(s) in the right frame.
An 'outer' join keeps the rows from both frames, including those where
the left and right frames did not have the same value in the matching
column(s); the columns contributed by the other frame are filled with None.
Notes
-----
When a column is named the same in both frames, it will result in two
columns in the new frame.
The column from the *left* frame (originally the current frame) will be
copied and the column name will have the string "_L" added to it.
The same thing will happen with the column from the *right* frame,
except its name has the string "_R" appended. The order of columns
after this method is called is not guaranteed.
It is recommended that you rename the columns to meaningful terms prior
to using the ``join`` method.
Examples
--------
Consider two frames: codes and colors
>>> codes.inspect()
[#] numbers
============
[0] 1
[1] 3
[2] 1
[3] 0
[4] 2
[5] 1
[6] 5
[7] 3
>>> colors.inspect()
[#] numbers color
====================
[0] 1 red
[1] 2 yellow
[2] 3 green
[3] 4 blue
Join them on the 'numbers' column ('inner' join by default)
>>> j_outer = codes.join_outer(colors, 'numbers')
[===Job Progress===]
>>> j_outer.inspect()
[#] numbers_L color
======================
[0] 0 None
[1] 1 red
[2] 1 red
[3] 1 red
[4] 2 yellow
[5] 3 green
[6] 3 green
[7] 4 blue
[8] 5 None
(The join adds an extra column *_R which is the join column from the right frame; it may be disregarded)
Consider two frames: country_codes_frame and country_names_frame
>>> country_codes_frame.inspect()
[#] country_code area_code test_str
======================================
[0] 1 354 a
[1] 2 91 a
[2] 2 100 b
[3] 3 47 a
[4] 4 968 c
[5] 5 50 c
>>> country_names_frame.inspect()
[#] country_code country_name test_str
=========================================
[0] 1 Iceland a
[1] 1 Ice-land a
[2] 2 India b
[3] 3 Norway a
[4] 4 Oman c
[5] 6 Germany c
Join them on the 'country_code' and 'test_str' columns ('inner' join by default)
>>> composite_join_outer = country_codes_frame.join_outer(country_names_frame, ['country_code', 'test_str'])
[===Job Progress===]
>>> composite_join_outer.inspect()
[#] country_code_L area_code test_str_L country_name
========================================================
[0] 6 None c Germany
[1] 1 354 a Iceland
[2] 1 354 a Ice-land
[3] 2 91 a None
[4] 2 100 b India
[5] 3 47 a Norway
[6] 4 968 c Oman
[7] 5 50 c None
"""
if left_on is None:
raise ValueError("Please provide column name on which join should be performed")
elif isinstance(left_on, basestring):
left_on = [left_on]
if right_on is None:
right_on = left_on
elif isinstance(right_on, basestring):
right_on = [right_on]
if len(left_on) != len(right_on):
raise ValueError("Please provide equal number of join columns")
from sparktk.frame.frame import Frame
return Frame(self._tc, self._scala.joinOuter(right._scala,
self._tc.jutils.convert.to_scala_list_string(left_on),
self._tc.jutils.convert.to_scala_option(
self._tc.jutils.convert.to_scala_list_string(right_on))))
def join_right(self,
right,
left_on,
right_on=None,
use_broadcast_left=False):
"""
join_right performs a right (right outer) join operation on one or two frames, creating a new frame.
Parameters
----------
:param right: (Frame) Another frame to join with
:param left_on: (List[str]) Names of the columns in the left frame used to match up the two frames.
:param right_on: (Optional[List[str]]) Names of the columns in the right frame used to match up the two frames. Default is the same as the left frame.
:param use_broadcast_left: (bool) If the left table is small enough to fit in the memory of a single machine,
set use_broadcast_left to True to potentially improve performance by using a broadcast join. Default is False.
:returns: (Frame) A new frame with the results of the join
Create a new frame from a SQL JOIN operation with another frame.
The frame on the 'left' is the currently active frame.
The frame on the 'right' is another frame.
This method takes column(s) in the left frame and matches their values
with column(s) in the right frame.
A 'right' join works like join_left with the roles reversed: it keeps
all rows from the right frame and includes data from the left frame only
where it matches.
Notes
-----
When a column is named the same in both frames, it will result in two
columns in the new frame.
The column from the *left* frame (originally the current frame) will be
copied and the column name will have the string "_L" added to it.
The same thing will happen with the column from the *right* frame,
except its name has the string "_R" appended. The order of columns
after this method is called is not guaranteed.
It is recommended that you rename the columns to meaningful terms prior
to using the ``join`` method.
Examples
--------
Consider two frames: codes and colors
>>> codes.inspect()
[#] numbers
============
[0] 1
[1] 3
[2] 1
[3] 0
[4] 2
[5] 1
[6] 5
[7] 3
>>> colors.inspect()
[#] numbers color
====================
[0] 1 red
[1] 2 yellow
[2] 3 green
[3] 4 blue
>>> j_right = codes.join_right(colors, 'numbers')
[===Job Progress===]
>>> j_right.inspect()
[#] numbers_R color
======================
[0] 1 red
[1] 1 red
[2] 1 red
[3] 2 yellow
[4] 3 green
[5] 3 green
[6] 4 blue
(The join adds an extra column *_R which is the join column from the right frame; it may be disregarded)
Consider two frames: country_codes_frame and country_names_frame
>>> country_codes_frame.inspect()
[#] country_code area_code test_str
======================================
[0] 1 354 a
[1] 2 91 a
[2] 2 100 b
[3] 3 47 a
[4] 4 968 c
[5] 5 50 c
>>> country_names_frame.inspect()
[#] country_code country_name test_str
=========================================
[0] 1 Iceland a
[1] 1 Ice-land a
[2] 2 India b
[3] 3 Norway a
[4] 4 Oman c
[5] 6 Germany c
Join them on the 'country_code' and 'test_str' columns ('inner' join by default)
>>> composite_join_right = country_codes_frame.join_right(country_names_frame, ['country_code', 'test_str'])
[===Job Progress===]
>>> composite_join_right.inspect()
[#] area_code country_code_R country_name test_str_R
========================================================
[0] None 6 Germany c
[1] 354 1 Iceland a
[2] 354 1 Ice-land a
[3] 100 2 India b
[4] 47 3 Norway a
[5] 968 4 Oman c
Right join broadcasting left table
>>> j_right = codes.join_right(colors, 'numbers', use_broadcast_left=True)
[===Job Progress===]
>>> j_right.inspect()
[#] numbers_R color
======================
[0] 1 red
[1] 1 red
[2] 1 red
[3] 2 yellow
[4] 3 green
[5] 3 green
[6] 4 blue
>>> composite_join_right = country_codes_frame.join_right(country_names_frame, ['country_code', 'test_str'], use_broadcast_left=True)
[===Job Progress===]
>>> composite_join_right.inspect()
[#] area_code country_code_R country_name test_str_R
========================================================
[0] 354 1 Iceland a
[1] 354 1 Ice-land a
[2] 100 2 India b
[3] 47 3 Norway a
[4] 968 4 Oman c
[5] None 6 Germany c
"""
if left_on is None:
raise ValueError("Please provide column name on which join should be performed")
elif isinstance(left_on, basestring):
left_on = [left_on]
if right_on is None:
right_on = left_on
elif isinstance(right_on, basestring):
right_on = [right_on]
if len(left_on) != len(right_on):
raise ValueError("Please provide equal number of join columns")
from sparktk.frame.frame import Frame
return Frame(self._tc, self._scala.joinRight(right._scala,
self._tc.jutils.convert.to_scala_list_string(left_on),
self._tc.jutils.convert.to_scala_option(
self._tc.jutils.convert.to_scala_list_string(right_on)),
use_broadcast_left))
def map_columns(self, func, schema):
"""
Create a new frame from the output of a UDF evaluated over each row of the current frame.
Notes
-----
1. The row |UDF| ('func') must return a value in the same format as
specified by the schema.
Parameters
----------
:param func: (UDF) Function which takes the values in the row and produces a value, or collection of values, for the new cell(s).
:param schema: (List[(str,type)]) Schema for the column(s) being added.
Examples
--------
Given our frame, let's create a new frame with the name and a column with how many years the person has been over 18
>>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
... ['Susan',33,3,'555-0202'],
... ['Thurston',65,26,'555-4510'],
... ['Judy',44,14,'555-2183']],
... schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])
>>> frame.inspect()
[#] name age tenure phone
====================================
[0] Fred 39 16 555-1234
[1] Susan 33 3 555-0202
[2] Thurston 65 26 555-4510
[3] Judy 44 14 555-2183
>>> adult = frame.map_columns(lambda row: [row.name, row.age - 18], [('name', str), ('adult_years', int)])
>>> adult.inspect()
[#] name adult_years
==========================
[0] Fred 21
[1] Susan 15
[2] Thurston 47
[3] Judy 26
Note that the function returns a list, and therefore the schema also needs to be a list.
It is not necessary to use lambda syntax; any function will do, as long as it takes a single row argument, and
the function may also call other local functions, as in the sketch below.
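For example, a minimal sketch (hypothetical helper name, reusing the frame defined above) using a named function instead of a lambda:
>>> def years_of_adulthood(row):
...     return [row.name, row.age - 18]
>>> adult2 = frame.map_columns(years_of_adulthood, [('name', str), ('adult_years', int)])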
(see also the 'add_columns' frame operation)
"""
schema_helper.validate(schema)
row = Row(self.schema)
def map_columns_func(r):
row._set_data(r)
return func(row)
if isinstance(schema, list):
rdd = self._python.rdd.map(lambda r: map_columns_func(r))
else:
rdd = self._python.rdd.map(lambda r: [map_columns_func(r)])
return self._tc.frame.create(rdd, schema)
def matrix_covariance_matrix(self, matrix_column_name):
"""
Compute the Covariance Matrix of matrices stored in a frame
Parameters
----------
:param matrix_column_name: Name of the column to compute the covariance matrix on
:return: (Frame) returns the frame with a new column storing the covariance matrix for the corresponding matrix
Calculate the covariance matrix for each matrix in column 'matrix_column_name' of a frame using the following:
Element (i, j) of the covariance matrix for a given matrix X is computed from the products (Xi - Mi)(Xj - Mj),
where Mi is the mean of Xi.
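As a worked check (my arithmetic, not library output) against the example below: taking each row of the 4x4 matrix as one variable with four observations, and dividing by n - 1 as for a sample covariance, element (0, 0) is ((1 - 2.75)^2 + (2 - 2.75)^2 + (3 - 2.75)^2 + (5 - 2.75)^2) / 3 = 8.75 / 3 ≈ 2.9167, which matches the first entry of the computed CovarianceMatrix_pixeldata.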
Examples
--------
>>> from sparktk import dtypes
>>> data = [[1, [[1,2,3,5],[2,3,5,6],[4,6,7,3],[8,9,2,4]]]]
>>> schema = [('id', int),('pixeldata', dtypes.matrix)]
>>> my_frame = tc.frame.create(data, schema)
>>> my_frame.inspect()
[#] id pixeldata
============================
[0] 1 [[ 1. 2. 3. 5.]
[ 2. 3. 5. 6.]
[ 4. 6. 7. 3.]
[ 8. 9. 2. 4.]]
Compute the covariance matrix for the matrices in 'pixeldata' column of the frame
>>> my_frame.matrix_covariance_matrix('pixeldata')
A new column gets added to the existing frame storing the covariance matrix
>>> my_frame.inspect()
[#] id pixeldata
============================
[0] 1 [[ 1. 2. 3. 5.]
[ 2. 3. 5. 6.]
[ 4. 6. 7. 3.]
[ 8. 9. 2. 4.]]
[#] CovarianceMatrix_pixeldata
============================================================
[0] [[ 2.91666667 3. -1. -3.75 ]
[ 3. 3.33333333 -0.33333333 -5. ]
[ -1. -0.33333333 3.33333333 -1. ]
[ -3.75 -5. -1. 10.91666667]]
"""
self._scala.matrixCovarianceMatrix(matrix_column_name)
def matrix_pca(self, matrix_column_name, v_matrix_column_name):
"""
Compute the Principal Component Analysis of a matrix
Parameters
----------
:param matrix_column_name: Name of the column storing the matrices whose principal components are to be computed
:param v_matrix_column_name: Name of the column storing the V matrix
:return: (Frame) returns the frame with new column storing the principal components for the corresponding matrix
Calculate the Principal Components for each matrix in column 'matrix_column_name' using the V matrix
Examples
--------
>>> from sparktk import dtypes
>>> data = [[1, [[1,2,3,5],[2,3,5,6],[4,6,7,3],[8,9,2,4]]]]
>>> schema = [('id', int),('pixeldata', dtypes.matrix)]
>>> my_frame = tc.frame.create(data, schema)
>>> my_frame.inspect()
[#] id pixeldata
============================
[0] 1 [[ 1. 2. 3. 5.]
[ 2. 3. 5. 6.]
[ 4. 6. 7. 3.]
[ 8. 9. 2. 4.]]
Compute the singular value decomposition for the matrices in 'pixeldata' column of the frame
>>> my_frame.matrix_svd('pixeldata')
Three new columns get added storing the U matrix, V matrix and Singular Vectors
>>> my_frame.inspect()
[#] id pixeldata
============================
[0] 1 [[ 1. 2. 3. 5.]
[ 2. 3. 5. 6.]
[ 4. 6. 7. 3.]
[ 8. 9. 2. 4.]]
[#] U_pixeldata
========================================================
[0] [[-0.29128979 -0.43716238 -0.44530839 0.72507913]
[-0.42474933 -0.55066945 -0.26749936 -0.66692972]
[-0.55099141 -0.16785045 0.79986267 0.16868433]
[-0.65661765 0.69099814 -0.30060644 -0.0317899 ]]
[#] V_pixeldata
========================================================
[0] [[-0.47195872 0.50289367 -0.05244699 -0.72222035]
[-0.60780067 0.40702574 0.11313693 0.67239008]
[-0.44835972 -0.58469285 0.65644993 -0.16180641]
[-0.45476024 -0.48945099 -0.74399115 0.01039344]]
[#] SingularVectors_pixeldata
============================================================
[0] [[ 18.21704938 6.59797925 3.54086993 0.26080987]]
Compute the principal components using the V matrices computed for matrices in 'pixeldata'
>>> my_frame.matrix_pca('pixeldata', 'V_pixeldata')
A new column gets added storing the Principal components matrix
>>> my_frame.inspect()
[#] id pixeldata
============================
[0] 1 [[ 1. 2. 3. 5.]
[ 2. 3. 5. 6.]
[ 4. 6. 7. 3.]
[ 8. 9. 2. 4.]]
[#] U_pixeldata
========================================================
[0] [[-0.29128979 -0.43716238 -0.44530839 0.72507913]
[-0.42474933 -0.55066945 -0.26749936 -0.66692972]
[-0.55099141 -0.16785045 0.79986267 0.16868433]
[-0.65661765 0.69099814 -0.30060644 -0.0317899 ]]
[#] V_pixeldata
========================================================
[0] [[-0.47195872 0.50289367 -0.05244699 -0.72222035]
[-0.60780067 0.40702574 0.11313693 0.67239008]
[-0.44835972 -0.58469285 0.65644993 -0.16180641]
[-0.45476024 -0.48945099 -0.74399115 0.01039344]]
[#] SingularVectors_pixeldata
============================================================
[0] [[ 18.21704938 6.59797925 3.54086993 0.26080987]]
[#] PrincipalComponents_pixeldata
========================================================
[0] [[-0.47195872 1.00578734 -0.15734098 -3.61110176]
[-1.21560134 1.22107722 0.56568466 4.0343405 ]
[-1.79343888 -3.50815713 4.59514953 -0.48541923]
[-3.63808191 -4.40505888 -1.4879823 0.04157377]]
"""
self._scala.matrixPca(matrix_column_name, v_matrix_column_name)
def matrix_svd(self, matrix_column_name):
"""
Compute the Singular Value Decomposition of a matrix
Parameters
----------
:param matrix_column_name: Name of the column to compute the svd on
:return: (Frame) returns the frame with three new columns storing the U matrix, V matrix and Singular Vectors
Calculate the Singular Value Decomposition for each matrix in column 'matrix_column_name'
Examples
--------
>>> from sparktk import dtypes
>>> data = [[1, [[1,2,3,5],[2,3,5,6],[4,6,7,3],[8,9,2,4]]]]
>>> schema = [('id', int),('pixeldata', dtypes.matrix)]
>>> my_frame = tc.frame.create(data, schema)
>>> my_frame.inspect()
[#] id pixeldata
============================
[0] 1 [[ 1. 2. 3. 5.]
[ 2. 3. 5. 6.]
[ 4. 6. 7. 3.]
[ 8. 9. 2. 4.]]
Compute the singular value decomposition for the matrices in 'pixeldata' column of the frame
>>> my_frame.matrix_svd('pixeldata')
Three new columns get added storing the U matrix, V matrix and Singular Vectors
>>> my_frame.inspect()
[#] id pixeldata
============================
[0] 1 [[ 1. 2. 3. 5.]
[ 2. 3. 5. 6.]
[ 4. 6. 7. 3.]
[ 8. 9. 2. 4.]]
[#] U_pixeldata
========================================================
[0] [[-0.29128979 -0.43716238 -0.44530839 0.72507913]
[-0.42474933 -0.55066945 -0.26749936 -0.66692972]
[-0.55099141 -0.16785045 0.79986267 0.16868433]
[-0.65661765 0.69099814 -0.30060644 -0.0317899 ]]
[#] V_pixeldata
========================================================
[0] [[-0.47195872 0.50289367 -0.05244699 -0.72222035]
[-0.60780067 0.40702574 0.11313693 0.67239008]
[-0.44835972 -0.58469285 0.65644993 -0.16180641]
[-0.45476024 -0.48945099 -0.74399115 0.01039344]]
[#] SingularVectors_pixeldata
============================================================
[0] [[ 18.21704938 6.59797925 3.54086993 0.26080987]]
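As an illustrative sanity check (plain numpy, not a sparktk call; variable names are hypothetical, and it assumes the stored V holds the right singular vectors as rows, so that pixeldata ≈ U · diag(s) · V.T):
>>> import numpy as np
>>> X = np.array([[1, 2, 3, 5], [2, 3, 5, 6], [4, 6, 7, 3], [8, 9, 2, 4]], dtype=float)
>>> U = np.array([[-0.29128979, -0.43716238, -0.44530839, 0.72507913],
...               [-0.42474933, -0.55066945, -0.26749936, -0.66692972],
...               [-0.55099141, -0.16785045, 0.79986267, 0.16868433],
...               [-0.65661765, 0.69099814, -0.30060644, -0.0317899]])
>>> V = np.array([[-0.47195872, 0.50289367, -0.05244699, -0.72222035],
...               [-0.60780067, 0.40702574, 0.11313693, 0.67239008],
...               [-0.44835972, -0.58469285, 0.65644993, -0.16180641],
...               [-0.45476024, -0.48945099, -0.74399115, 0.01039344]])
>>> s = np.array([18.21704938, 6.59797925, 3.54086993, 0.26080987])
>>> assert np.allclose(U.dot(np.diag(s)).dot(V.T), X, atol=1e-5)  # reconstructs the original matrix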
"""
self._scala.matrixSvd(matrix_column_name)
def multiclass_classification_metrics(self, label_column, pred_column, beta=1.0, frequency_column=None):
"""
Statistics of accuracy, precision, and others for a multi-class classification model.
Parameters:
:param label_column: (str) The name of the column containing the correct label for each instance.
:param pred_column: (str) The name of the column containing the predicted label for each instance.
:param beta: (Optional[int]) This is the beta value to use for :math:`F_{ \beta}` measure (default F1 measure is computed);
must be greater than zero. Default is 1.
:param frequency_column: (Optional[str]) The name of an optional column containing the frequency of observations.
:return: (ClassificationMetricsValue) The data returned is composed of multiple components:
<object>.accuracy : double
<object>.confusion_matrix : table
<object>.f_measure : double
<object>.precision : double
<object>.recall : double
Calculate the accuracy, precision, confusion_matrix, recall and :math:`F_{ \beta}` measure for a
classification model.
* The **f_measure** result is the :math:`F_{ \beta}` measure for a
classification model.
The :math:`F_{ \beta}` measure of a binary classification model is the
harmonic mean of precision and recall.
If we let:
* beta :math:`\equiv \beta`,
* :math:`T_{P}` denotes the number of true positives,
* :math:`F_{P}` denotes the number of false positives, and
* :math:`F_{N}` denotes the number of false negatives
then:
.. math::
F_{ \beta} = (1 + \beta ^ 2) * \frac{ \frac{T_{P}}{T_{P} + F_{P}} * \
\frac{T_{P}}{T_{P} + F_{N}}}{ \beta ^ 2 * \frac{T_{P}}{T_{P} + \
F_{P}} + \frac{T_{P}}{T_{P} + F_{N}}}
The :math:`F_{ \beta}` measure for a multi-class classification model is
computed as the weighted average of the :math:`F_{ \beta}` measure for
each label, where the weight is the number of instances of each label.
The determination of binary vs. multi-class is automatically inferred
from the data.
* For multi-class classification models, the **recall** measure is computed as
the weighted average of the recall for each label, where the weight is
the number of instances of each label.
The determination of binary vs. multi-class is automatically inferred
from the data.
* For multi-class classification models, the **precision** measure is computed
as the weighted average of the precision for each label, where the
weight is the number of instances of each label.
The determination of binary vs. multi-class is automatically inferred
from the data.
* The **accuracy** of a classification model is the proportion of
predictions that are correctly identified.
If we let :math:`T_{P}` denote the number of true positives,
:math:`T_{N}` denote the number of true negatives, and :math:`K` denote
the total number of classified instances, then the model accuracy is
given by: :math:`\frac{T_{P} + T_{N}}{K}`.
* The **confusion_matrix** result is a confusion matrix for a
classifier model, formatted for human readability.
Examples
--------
Consider Frame *my_frame*, which contains the data
>>> my_frame.inspect()
[#] a b labels predictions
==================================
[0] red 1 0 0
[1] blue 3 1 0
[2] green 1 0 0
[3] green 0 1 1
[4] red 0 5 4
>>> cm = my_frame.multiclass_classification_metrics('labels', 'predictions')
[===Job Progress===]
>>> cm.f_measure
0.5866666666666667
>>> cm.recall
0.6
>>> cm.accuracy
0.6
>>> cm.precision
0.6666666666666666
>>> cm.confusion_matrix
Predicted_0 Predicted_1 Predicted_4
Actual_0 2 0 0
Actual_1 1 1 0
Actual_5 0 0 1
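As a worked check of the weighted averages described above (my arithmetic, not library output): the per-label F1 scores are 0.8 for label 0 (precision 2/3, recall 1), 2/3 for label 1 (precision 1, recall 1/2) and 0 for label 5, so the weighted F1 is (2*0.8 + 2*(2/3) + 1*0) / 5 ≈ 0.5867, matching cm.f_measure above; likewise the weighted recall is (2*1 + 2*0.5 + 0) / 5 = 0.6 and the weighted precision is (2*(2/3) + 2*1 + 0) / 5 ≈ 0.6667.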
"""
return ClassificationMetricsValue(self._tc, self._scala.multiClassClassificationMetrics(label_column,
pred_column,
float(beta),
self._tc.jutils.convert.to_scala_option(frequency_column)))
def power_iteration_clustering(self, source_column, destination_column, similarity_column, k=2, max_iterations=100,
initialization_mode="random"):
"""
Power Iteration Clustering finds a low-dimensional embedding of a dataset using truncated power iteration on a
normalized pair-wise similarity matrix of the data.
Parameters
----------
:param source_column: (str) Name of the column containing the source node
:param destination_column: (str) Name of the column containing the destination node
:param similarity_column: (str) Name of the column containing the similarity
:param k: (Optional(int)) Number of clusters to cluster the graph into. Default is 2
:param max_iterations: (Optional(int)) Maximum number of iterations of the power iteration loop. Default is 100
:param initialization_mode: (Optional(str)) Initialization mode of power iteration clustering. This can be either
"random" to use a random vector as vertex properties, or "degree" to use normalized sum similarities. Default is "random".
:return: (namedtuple) Returns a namedtuple containing the results frame (node and cluster), k (number of clusters),
and cluster_sizes (a map of clusters and their respective sizes)
Example
-------
>>> frame = tc.frame.create([[1,2,1.0],
... [1,3,0.3],
... [2,3,0.3],
... [3,0,0.03],
... [0,5,0.01],
... [5,4,0.3],
... [5,6,1.0],
... [4,6,0.3]],
... [('Source', int), ('Destination', int), ('Similarity',float)])
>>> frame.inspect()
[#] Source Destination Similarity
====================================
[0] 1 2 1.0
[1] 1 3 0.3
[2] 2 3 0.3
[3] 3 0 0.03
[4] 0 5 0.01
[5] 5 4 0.3
[6] 5 6 1.0
[7] 4 6 0.3
>>> x = frame.power_iteration_clustering('Source', 'Destination', 'Similarity', k=3)
>>> x.frame.inspect()
[#] id cluster
================
[0] 4 2
[1] 0 3
[2] 6 2
[3] 2 1
[4] 1 1
[5] 3 1
[6] 5 2
>>> x.k
3
>>> x.cluster_sizes
{u'2': 3, u'3': 1, u'1': 3}
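A hypothetical variant call using 'degree' initialization on the same frame (results omitted):
>>> x2 = frame.power_iteration_clustering('Source', 'Destination', 'Similarity', k=3, initialization_mode='degree')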
"""
result = self._scala.powerIterationClustering(source_column,
destination_column,
similarity_column,
k,
max_iterations,
initialization_mode)
k_val = result.k()
cluster_sizes = self._tc.jutils.convert.scala_map_to_python(result.clusterSizes())
from sparktk.frame.frame import Frame
py_frame = Frame(self._tc, result.clusterMapFrame())
return PicResult(frame=py_frame, k=k_val, cluster_sizes=cluster_sizes)
def quantile_bin_column(self, column_name, num_bins=None, bin_column_name=None):
"""
Classify column into groups with the same frequency.
Group rows of data based on the value in a single column and add a label
to identify grouping.
Equal depth binning attempts to label rows such that each bin contains the
same number of elements.
For :math:`n` bins of a column :math:`C` of length :math:`m`, the bin
number is determined by:
.. math::
\lceil n * \frac { f(C) }{ m } \rceil
where :math:`f` is a tie-adjusted ranking function over values of
:math:`C`.
If there are multiples of the same value in :math:`C`, then their
tie-adjusted rank is the average of their ordered rank values.
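As a worked instance of this formula (my arithmetic, using the 11-row example below with n = 5 bins): the value 13 has tie-adjusted rank 7, so its bin number is ceil(5 * 7 / 11) = ceil(3.18) = 4, i.e. the fourth bin, which corresponds to the zero-based label 3 shown in the example output.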
Notes
-----
1. The num_bins parameter is considered to be the maximum permissible number
of bins because the data may dictate fewer bins.
For example, if the column to be binned has a quantity of :math:`X`
elements with only 2 distinct values and the *num_bins* parameter is
greater than 2, then the actual number of bins will only be 2.
This is due to a restriction that elements with an identical value must
belong to the same bin.
Parameters
----------
:param column_name: (str) The column whose values are to be binned.
:param num_bins: (Optional[int]) The maximum number of quantiles.
Default is the Square-root choice
:math:`\lfloor \sqrt{m} \rfloor`, where :math:`m` is the number of rows.
:param bin_column_name: (Optional[str]) The name for the new column holding the grouping labels.
Default is the original column name with '_binned' appended
:return: (List[float]) A list containing the edges of each bin
Examples
--------
Given a frame with column *a* accessed by a Frame object *my_frame*:
>>> my_frame.inspect( n=11 )
[##] a
========
[0] 1
[1] 1
[2] 2
[3] 3
[4] 5
[5] 8
[6] 13
[7] 21
[8] 34
[9] 55
[10] 89
Modify the frame, adding a column showing what bin the data is in.
The data should be grouped into a maximum of five bins.
Note that each bin will have the same quantity of members (as much as
possible):
>>> cutoffs = my_frame.quantile_bin_column('a', 5, 'aEDBinned')
[===Job Progress===]
>>> my_frame.inspect( n=11 )
[##] a aEDBinned
===================
[0] 1 0
[1] 1 0
[2] 2 1
[3] 3 1
[4] 5 2
[5] 8 2
[6] 13 3
[7] 21 3
[8] 34 4
[9] 55 4
[10] 89 4
>>> print cutoffs
[1.0, 2.0, 5.0, 13.0, 34.0, 89.0]
"""
return self._tc.jutils.convert.from_scala_seq(self._scala.quantileBinColumn(column_name,
self._tc.jutils.convert.to_scala_option(num_bins),
self._tc.jutils.convert.to_scala_option(bin_column_name)))
def quantiles(self, column_name, quantiles):
"""
Returns a new frame with Quantiles and their values.
Parameters
----------
:param column_name: (str) The column to calculate quantiles on
:param quantiles: (List[float]) The quantiles being requested
:return: (Frame) A new frame with two columns (float64): requested Quantiles and their respective values.
Calculates quantiles on the given column.
Examples
--------
Consider Frame *my_frame*, which accesses a frame that contains a single
column *final_sale_price*:
>>> my_frame.inspect()
[#] final_sale_price
=====================
[0] 100
[1] 250
[2] 95
[3] 179
[4] 315
[5] 660
[6] 540
[7] 420
[8] 250
[9] 335
To calculate 10th, 50th, and 100th quantile:
>>> quantiles_frame = my_frame.quantiles('final_sale_price', [10, 50, 100])
[===Job Progress===]
A new Frame containing the requested Quantiles and their respective values
will be returned:
>>> quantiles_frame.inspect()
[#] Quantiles final_sale_price_QuantileValue
==============================================
[0] 10.0 95.0
[1] 50.0 250.0
[2] 100.0 660.0
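Similarly, a hypothetical single-quantile request would return just the median in a one-row frame:
>>> median_frame = my_frame.quantiles('final_sale_price', [50])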
"""
from sparktk.frame.frame import Frame
return Frame(self._tc, self._scala.quantiles(column_name, self._tc.jutils.convert.to_scala_list_double(quantiles)))
def rename_columns(self, names):
"""
Rename columns
Parameters
----------
:param names: (dict) Dictionary of old names to new names.
Examples
--------
Start with a frame with columns *Black* and *White*.
>>> print my_frame.schema
[('Black', <type 'unicode'>), ('White', <type 'unicode'>)]
Rename the columns to *Mercury* and *Venus*:
>>> my_frame.rename_columns({"Black": "Mercury", "White": "Venus"})
>>> print my_frame.schema
[(u'Mercury', <type 'unicode'>), (u'Venus', <type 'unicode'>)]
"""
if not isinstance(names, dict):
raise ValueError("Unsupported 'names' parameter type. Expected dictionary, but found %s." % type(names))
if self.schema is None:
raise RuntimeError("Unable rename column(s), because the frame's schema has not been defined.")
if self._is_python:
scala_rename_map = self._tc.jutils.convert.to_scala_map(names)
scala_schema = schema_to_scala(self._tc.sc, self._python.schema)
rename_scala_schema = scala_schema.renameColumns(scala_rename_map)
self._python.schema = schema_to_python(self._tc.sc, rename_scala_schema)
else:
self._scala.renameColumns(self._tc.jutils.convert.to_scala_map(names))
def reverse_box_cox(
self, column_name, lambda_value=0.0, reverse_box_cox_column_name=None)
Calculate the reverse box-cox transformation for each row on a given column_name of the current frame
column_name: | Name of the column to perform the reverse transformation on |
lambda_value: | Lambda power parameter. Default is 0.0 |
reverse_box_cox_column_name: | Optional column name for the reverse box cox value |
Returns | (Frame): | returns a frame with a new column storing the reverse box-cox transformed value |
Calculate the reverse box-cox transformation for each row in column 'column_name' of a frame using the lambda_value.
Reverse Box-cox transformation is computed by the following formula:
reverse_box_cox = exp(boxcox); if lambda=0, reverse_box_cox = (lambda * boxcox + 1)^(1/lambda) ; else
>>> data = [[7.7132064326674596, 2.81913279907],[0.207519493594015, -1.25365381375],[6.336482349262754, 2.46673638752], [7.4880388253861181, 2.76469126003],[4.9850701230259045, 2.06401101556]]
>>> schema = [("input", float), ("input_lambda_0.3", float)]
>>> my_frame = tc.frame.create(data, schema)
>>> my_frame.inspect()
[#] input input_lambda_0.3
=====================================
[0] 7.71320643267 2.81913279907
[1] 0.207519493594 -1.25365381375
[2] 6.33648234926 2.46673638752
[3] 7.48803882539 2.76469126003
[4] 4.98507012303 2.06401101556
Compute the reverse box-cox transformation on the 'input_lambda_0.3' column which stores the box-cox transformed
value on column 'input' with lambda 0.3
>>> my_frame.reverse_box_cox('input_lambda_0.3',0.3)
A new column gets added to the frame which stores the reverse box-cox transformation for each row.
These values are equal to the original values in the 'input' column.
>>> my_frame.inspect()
[#] input input_lambda_0.3 input_lambda_0.3_reverse_lambda_0.3
==========================================================================
[0] 7.71320643267 2.81913279907 7.71320643267
[1] 0.207519493594 -1.25365381375 0.207519493594
[2] 6.33648234926 2.46673638752 6.33648234926
[3] 7.48803882539 2.76469126003 7.4880388254
[4] 4.98507012303 2.06401101556 4.98507012301
def reverse_box_cox(self, column_name, lambda_value=0.0, reverse_box_cox_column_name= None):
"""
Calculate the reverse box-cox transformation for each row on a given column_name of the current frame
Parameters
----------
:param column_name: Name of the column to perform the reverse transformation on
:param lambda_value: Lambda power parameter. Default is 0.0
:param reverse_box_cox_column_name: Optional column name for the reverse box cox value
:return: (Frame) returns a frame with a new column storing the reverse box-cox transformed value
Calculate the reverse box-cox transformation for each row in column 'column_name' of a frame using the lambda_value.
Reverse Box-cox transformation is computed by the following formula:
reverse_box_cox = exp(boxcox), if lambda = 0
reverse_box_cox = (lambda * boxcox + 1)^(1/lambda), otherwise
Examples
--------
>>> data = [[7.7132064326674596, 2.81913279907],[0.207519493594015, -1.25365381375],[6.336482349262754, 2.46673638752], [7.4880388253861181, 2.76469126003],[4.9850701230259045, 2.06401101556]]
>>> schema = [("input", float), ("input_lambda_0.3", float)]
>>> my_frame = tc.frame.create(data, schema)
>>> my_frame.inspect()
[#] input input_lambda_0.3
=====================================
[0] 7.71320643267 2.81913279907
[1] 0.207519493594 -1.25365381375
[2] 6.33648234926 2.46673638752
[3] 7.48803882539 2.76469126003
[4] 4.98507012303 2.06401101556
Compute the reverse box-cox transformation on the 'input_lambda_0.3' column which stores the box-cox transformed
value on column 'input' with lambda 0.3
>>> my_frame.reverse_box_cox('input_lambda_0.3',0.3)
A new column gets added to the frame which stores the reverse box-cox transformation for each row.
These values are equal to the original values in the 'input' column.
>>> my_frame.inspect()
[#] input input_lambda_0.3 input_lambda_0.3_reverse_lambda_0.3
==========================================================================
[0] 7.71320643267 2.81913279907 7.71320643267
[1] 0.207519493594 -1.25365381375 0.207519493594
[2] 6.33648234926 2.46673638752 6.33648234926
[3] 7.48803882539 2.76469126003 7.4880388254
[4] 4.98507012303 2.06401101556 4.98507012301
"""
self._scala.reverseBoxCox(column_name, lambda_value, self._tc.jutils.convert.to_scala_option(reverse_box_cox_column_name))
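As a sanity check on the formula above, here is a small pure-Python version of the reverse transformation (a sketch, not the Scala implementation used by the method):

import math

# Sketch of the reverse Box-Cox formula described above.
def reverse_box_cox_value(boxcox, lambda_value):
    if lambda_value == 0.0:
        return math.exp(boxcox)
    return (lambda_value * boxcox + 1) ** (1.0 / lambda_value)

reverse_box_cox_value(2.81913279907, 0.3)   # ~7.713, recovering the original 'input' value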
def save(
self, path)
Persists the frame to the given file path
def save(self, path):
"""Persists the frame to the given file path"""
self._scala.save(path)
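A minimal usage sketch, assuming a TkContext is available as tc, that the saved frame can be read back with tc.load, and that the path shown is hypothetical:

>>> frame.save("sandbox/my_saved_frame")
>>> restored = tc.load("sandbox/my_saved_frame")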
def sort(
self, columns, ascending=True)
Sort by one or more columns.
columns | (str or List[str]): | Either a column name, list of column names, or list of tuples where each tuple is a name and an ascending bool value. |
ascending | (Optional[bool]): | True for ascending (default), or False for descending. |
Sort a frame by column values either ascending or descending.
Consider the frame:
>>> frame.inspect()
[#] col1 col2
==================
[0] 3 foxtrot
[1] 1 charlie
[2] 3 bravo
[3] 2 echo
[4] 4 delta
[5] 3 alpha
Sort a single column:
>>> frame.sort('col1')
[===Job Progress===]
>>> frame.inspect()
[#] col1 col2
==================
[0] 1 charlie
[1] 2 echo
[2] 3 foxtrot
[3] 3 bravo
[4] 3 alpha
[5] 4 delta
Sort a single column descending:
>>> frame.sort('col2', False)
[===Job Progress===]
>>> frame.inspect()
[#] col1 col2
==================
[0] 3 foxtrot
[1] 2 echo
[2] 4 delta
[3] 1 charlie
[4] 3 bravo
[5] 3 alpha
Sort multiple columns:
>>> frame.sort(['col1', 'col2'])
[===Job Progress===]
>>> frame.inspect()
[#] col1 col2
==================
[0] 1 charlie
[1] 2 echo
[2] 3 alpha
[3] 3 bravo
[4] 3 foxtrot
[5] 4 delta
Sort multiple columns descending:
>>> frame.sort(['col1', 'col2'], False)
[===Job Progress===]
>>> frame.inspect()
[#] col1 col2
==================
[0] 4 delta
[1] 3 foxtrot
[2] 3 bravo
[3] 3 alpha
[4] 2 echo
[5] 1 charlie
Sort multiple columns: 'col1' descending and 'col2' ascending:
>>> frame.sort([ ('col1', False), ('col2', True) ])
[===Job Progress===]
>>> frame.inspect()
[#] col1 col2
==================
[0] 4 delta
[1] 3 alpha
[2] 3 bravo
[3] 3 foxtrot
[4] 2 echo
[5] 1 charlie
def sort(self, columns, ascending=True):
"""
Sort by one or more columns.
Parameters
----------
:param columns: (str or List[str]) Either a column name, list of column names, or list of tuples where each tuple is a name and an
ascending bool value.
:param ascending: (Optional[bool]) True for ascending (default), or False for descending.
Sort a frame by column values either ascending or descending.
Examples
--------
Consider the frame:
>>> frame.inspect()
[#] col1 col2
==================
[0] 3 foxtrot
[1] 1 charlie
[2] 3 bravo
[3] 2 echo
[4] 4 delta
[5] 3 alpha
Sort a single column:
>>> frame.sort('col1')
[===Job Progress===]
>>> frame.inspect()
[#] col1 col2
==================
[0] 1 charlie
[1] 2 echo
[2] 3 foxtrot
[3] 3 bravo
[4] 3 alpha
[5] 4 delta
Sort a single column descending:
>>> frame.sort('col2', False)
[===Job Progress===]
>>> frame.inspect()
[#] col1 col2
==================
[0] 3 foxtrot
[1] 2 echo
[2] 4 delta
[3] 1 charlie
[4] 3 bravo
[5] 3 alpha
Sort multiple columns:
>>> frame.sort(['col1', 'col2'])
[===Job Progress===]
>>> frame.inspect()
[#] col1 col2
==================
[0] 1 charlie
[1] 2 echo
[2] 3 alpha
[3] 3 bravo
[4] 3 foxtrot
[5] 4 delta
Sort multiple columns descending:
>>> frame.sort(['col1', 'col2'], False)
[===Job Progress===]
>>> frame.inspect()
[#] col1 col2
==================
[0] 4 delta
[1] 3 foxtrot
[2] 3 bravo
[3] 3 alpha
[4] 2 echo
[5] 1 charlie
Sort multiple columns: 'col1' descending and 'col2' ascending:
>>> frame.sort([ ('col1', False), ('col2', True) ])
[===Job Progress===]
>>> frame.inspect()
[#] col1 col2
==================
[0] 4 delta
[1] 3 alpha
[2] 3 bravo
[3] 3 foxtrot
[4] 2 echo
[5] 1 charlie
"""
if columns is None:
raise ValueError("The columns parameter should not be None.")
elif not isinstance(columns, list):
columns = [columns]
if not columns:
raise ValueError("The columns parameter should not be empty.")
if self._is_scala:
scala_sort(self, columns, ascending)
else:
column_names = columns # list of column names
columns_ascending = ascending # boolean summarizing if we are sorting ascending or descending
if isinstance(columns[0], tuple):
are_all_proper_tuples = all(isinstance(c, tuple) and isinstance(c[0], basestring) and isinstance(c[1], bool) for c in columns)
if not are_all_proper_tuples:
raise ValueError("If the columns paramter is a list of tuples, each tuple must have a string (column name)"
"and a bool (True for ascending).")
column_names = [c[0] for c in columns] # Grab just the column names from the list of tuples
# Check ascending booleans in the tuples to see if they're all the same
are_all_same_ascending = all(c[1] == columns[0][1] for c in columns)
if are_all_same_ascending:
columns_ascending = columns[0][1]
else:
are_all_same_ascending = True
if are_all_same_ascending:
indices = sparktk.frame.schema.get_indices_for_selected_columns(self.schema, column_names)
self._python.rdd = self.rdd.sortBy(lambda x: tuple([x[index] for index in indices]), ascending=columns_ascending)
else:
# If there are different ascending values between columns, then use scala sort
scala_sort(self, columns, ascending)
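The pure-Python path above boils down to sorting by a tuple key built from the selected column indices. A standalone sketch of that idea, using a plain list instead of an RDD (illustrative only):

# Illustrative only: sort rows locally by the values at the selected column indices.
rows = [[3, 'foxtrot'], [1, 'charlie'], [3, 'bravo'], [2, 'echo'], [4, 'delta'], [3, 'alpha']]
indices = [0, 1]   # sort by col1, then col2
sorted(rows, key=lambda row: tuple(row[i] for i in indices))
# [[1, 'charlie'], [2, 'echo'], [3, 'alpha'], [3, 'bravo'], [3, 'foxtrot'], [4, 'delta']]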
def sorted_k(
self, k, column_names_and_ascending, reduce_tree_depth=2)
Get a sorted subset of the data.
k | (int): | Number of sorted records to return. |
column_names_and_ascending | (List[tuple(str, bool)]): | Column names to sort by, and true to sort column by ascending order, or false for descending order. |
reduce_tree_depth | (int): | Advanced tuning parameter which determines the depth of the reduce-tree (uses Spark's treeReduce() for scalability.) Default is 2. |
Returns | (Frame): | A new frame with a subset of sorted rows from the original frame. |
Take a number of rows and return them sorted in either ascending or descending order.
Sorting a subset of rows is more efficient than sorting the entire frame when the number of sorted rows is much less than the total number of rows in the frame.
The number of sorted rows should be much smaller than the number of rows in the original frame.
In particular:
- The number of sorted rows returned should fit in Spark driver memory. The maximum size of serialized results that can fit in the Spark driver is set by the Spark configuration parameter spark.driver.maxResultSize.
- If you encounter a Kryo buffer overflow exception, increase the Spark configuration parameter spark.kryoserializer.buffer.max.mb.
- Use Frame.sort() instead if the number of sorted rows is very large (in other words, it cannot fit in Spark driver memory).
These examples deal with the most recently-released movies in a private collection. Consider the movie collection already stored in the frame below:
>>> my_frame.inspect()
[#] genre year title
========================================================
[0] Drama 1957 12 Angry Men
[1] Crime 1946 The Big Sleep
[2] Western 1969 Butch Cassidy and the Sundance Kid
[3] Drama 1971 A Clockwork Orange
[4] Drama 2008 The Dark Knight
[5] Animation 2013 Frozen
[6] Drama 1972 The Godfather
[7] Animation 1994 The Lion King
[8] Animation 2010 Tangled
[9] Fantasy 1939 The WOnderful Wizard of Oz
This example returns the top 3 rows sorted by a single column: 'year' descending:
>>> topk_frame = my_frame.sorted_k(3, [ ('year', False) ])
[===Job Progress===]
>>> topk_frame.inspect()
[#] genre year title
=====================================
[0] Animation 2013 Frozen
[1] Animation 2010 Tangled
[2] Drama 2008 The Dark Knight
This example returns the top 5 rows sorted by multiple columns: 'genre' ascending, then 'year' descending:
>>> topk_frame = my_frame.sorted_k(5, [ ('genre', True), ('year', False) ])
[===Job Progress===]
>>> topk_frame.inspect()
[#] genre year title
=====================================
[0] Animation 2013 Frozen
[1] Animation 2010 Tangled
[2] Animation 1994 The Lion King
[3] Crime 1946 The Big Sleep
[4] Drama 2008 The Dark Knight
This example returns the top 5 rows sorted by multiple columns: 'genre' ascending, then 'year' ascending. It also illustrates the optional tuning parameter for reduce-tree depth (which does not affect the final result).
>>> topk_frame = my_frame.sorted_k(5, [ ('genre', True), ('year', True) ], reduce_tree_depth=1)
[===Job Progress===]
>>> topk_frame.inspect()
[#] genre year title
===================================
[0] Animation 1994 The Lion King
[1] Animation 2010 Tangled
[2] Animation 2013 Frozen
[3] Crime 1946 The Big Sleep
[4] Drama 1957 12 Angry Men
def sorted_k(self, k, column_names_and_ascending, reduce_tree_depth = 2):
"""
Get a sorted subset of the data.
Parameters
----------
:param k: (int) Number of sorted records to return.
:param column_names_and_ascending: (List[tuple(str, bool)]) Column names to sort by, and true to sort column
by ascending order, or false for descending order.
:param reduce_tree_depth: (int) Advanced tuning parameter which determines the depth of the
reduce-tree (uses Spark's treeReduce() for scalability.)
Default is 2.
:return: (Frame) A new frame with a subset of sorted rows from the original frame.
Take a number of rows and return them sorted in either ascending or descending order.
Sorting a subset of rows is more efficient than sorting the entire frame when
the number of sorted rows is much less than the total number of rows in the frame.
Notes
-----
The number of sorted rows should be much smaller than the number of rows
in the original frame.
In particular:
1. The number of sorted rows returned should fit in Spark driver memory.
The maximum size of serialized results that can fit in the Spark driver is
set by the Spark configuration parameter *spark.driver.maxResultSize*.
+ If you encounter a Kryo buffer overflow exception, increase the Spark
configuration parameter *spark.kryoserializer.buffer.max.mb*.
+ Use Frame.sort() instead if the number of sorted rows is very large (in
other words, it cannot fit in Spark driver memory).
Examples
--------
These examples deal with the most recently-released movies in a private collection.
Consider the movie collection already stored in the frame below:
>>> my_frame.inspect()
[#] genre year title
========================================================
[0] Drama 1957 12 Angry Men
[1] Crime 1946 The Big Sleep
[2] Western 1969 Butch Cassidy and the Sundance Kid
[3] Drama 1971 A Clockwork Orange
[4] Drama 2008 The Dark Knight
[5] Animation 2013 Frozen
[6] Drama 1972 The Godfather
[7] Animation 1994 The Lion King
[8] Animation 2010 Tangled
[9] Fantasy 1939 The WOnderful Wizard of Oz
This example returns the top 3 rows sorted by a single column: 'year' descending:
>>> topk_frame = my_frame.sorted_k(3, [ ('year', False) ])
[===Job Progress===]
>>> topk_frame.inspect()
[#] genre year title
=====================================
[0] Animation 2013 Frozen
[1] Animation 2010 Tangled
[2] Drama 2008 The Dark Knight
This example returns the top 5 rows sorted by multiple columns: 'genre' ascending, then 'year' descending:
>>> topk_frame = my_frame.sorted_k(5, [ ('genre', True), ('year', False) ])
[===Job Progress===]
>>> topk_frame.inspect()
[#] genre year title
=====================================
[0] Animation 2013 Frozen
[1] Animation 2010 Tangled
[2] Animation 1994 The Lion King
[3] Crime 1946 The Big Sleep
[4] Drama 2008 The Dark Knight
This example returns the top 5 rows sorted by multiple columns: 'genre'
ascending, then 'year' ascending.
It also illustrates the optional tuning parameter for reduce-tree depth
(which does not affect the final result).
>>> topk_frame = my_frame.sorted_k(5, [ ('genre', True), ('year', True) ], reduce_tree_depth=1)
[===Job Progress===]
>>> topk_frame.inspect()
[#] genre year title
===================================
[0] Animation 1994 The Lion King
[1] Animation 2010 Tangled
[2] Animation 2013 Frozen
[3] Crime 1946 The Big Sleep
[4] Drama 1957 12 Angry Men
"""
from sparktk.frame.frame import Frame
return Frame(self._tc,
self._scala.sortedK(k,
self._tc.jutils.convert.to_scala_list_string_bool_tuple(column_names_and_ascending),
reduce_tree_depth))
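Locally, the effect of sorted_k is comparable to keeping the k best rows under the requested ordering. A rough single-column analogue with heapq (not the distributed treeReduce implementation):

import heapq

# Illustrative only: top 3 rows by 'year' (tuple index 1) descending.
movies = [("Drama", 1957, "12 Angry Men"), ("Animation", 2013, "Frozen"),
          ("Animation", 2010, "Tangled"), ("Drama", 2008, "The Dark Knight")]
heapq.nlargest(3, movies, key=lambda row: row[1])
# [('Animation', 2013, 'Frozen'), ('Animation', 2010, 'Tangled'), ('Drama', 2008, 'The Dark Knight')]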
def take(
self, n, offset=0, columns=None)
Get data subset.
Take a subset of the currently active Frame.
(See 'collect' operation to simply get all the data from the Frame)
n | (int): | The number of rows to get from the frame (warning: do not overwhelm the python session by taking too much) |
offset | (Optional[int]): | The number of rows to skip before starting to copy. |
columns | (Optional[str or list[str]): | If not None, only the given columns' data will be provided. By default, all columns are included. |
Returns | (list[list[data]]): | raw frame data |
Consider the following frame:
>>> frame.inspect()
[#] name age tenure phone
====================================
[0] Fred 39 16 555-1234
[1] Susan 33 3 555-0202
[2] Thurston 65 26 555-4510
[3] Judy 44 14 555-2183
Use take to get the first two rows and look at the schema and data in the result:
>>> frame.take(2)
[['Fred', 39, 16, '555-1234'], ['Susan', 33, 3, '555-0202']]
Limit the columns in our result to just the name and age column:
>>> frame.take(2, columns=['name', 'age'])
[['Fred', 39], ['Susan', 33]]
def take(self, n, offset=0, columns=None):
"""
Get data subset.
Take a subset of the currently active Frame.
(See 'collect' operation to simply get all the data from the Frame)
Parameters
----------
:param n: (int) The number of rows to get from the frame (warning: do not overwhelm the python session
by taking too much)
:param offset: (Optional[int]) The number of rows to skip before starting to copy.
:param columns: (Optional[str or list[str]) If not None, only the given columns' data will be provided.
By default, all columns are included.
:return: (list[list[data]]) raw frame data
Examples
--------
Consider the following frame:
>>> frame.inspect()
[#] name age tenure phone
====================================
[0] Fred 39 16 555-1234
[1] Susan 33 3 555-0202
[2] Thurston 65 26 555-4510
[3] Judy 44 14 555-2183
Use take to get the first two rows and look at the schema and data in the result:
>>> frame.take(2)
[['Fred', 39, 16, '555-1234'], ['Susan', 33, 3, '555-0202']]
Limit the columns in our result to just the name and age column:
>>> frame.take(2, columns=['name', 'age'])
[['Fred', 39], ['Susan', 33]]
"""
require_type.non_negative_int(n, "n")
require_type.non_negative_int(offset, "offset")
if columns is not None:
columns = affirm_type.list_of_str(columns, "columns")
if not columns:
return []
if self._is_scala:
scala_data = self._scala.take(n, offset, self._tc.jutils.convert.to_scala_option_list_string(columns))
schema = get_schema_for_columns(self.schema, columns) if columns else self.schema
data = TakeCollectHelper.scala_rows_to_python(self._tc, scala_data, schema)
else:
require_type.non_negative_int(n, "n")
if offset:
data = _take_offset(self, n, offset, columns)
elif columns:
select_columns = TakeCollectHelper.get_select_columns_function(self.schema, columns)
data = self._python.rdd.map(select_columns).take(n)
else:
data = self._python.rdd.take(n)
return data
def tally(
self, sample_col, count_val)
Count number of times a value is seen.
sample_col | (str): | The name of the column from which to compute the cumulative count. |
count_val | (str): | The column value to be used for the counts. |
A cumulative count is computed by sequentially stepping through the rows, observing the column values and keeping track of the number of times the specified count_value has been seen.
Consider Frame my_frame, which accesses a frame that contains a single column named obs:
>>> my_frame.inspect()
[#] obs
========
[0] 0
[1] 1
[2] 2
[3] 0
[4] 1
[5] 2
The cumulative count for column obs is obtained by:
>>> my_frame.tally("obs", "1")
[===Job Progress===]
The Frame my_frame accesses the original frame that now contains two columns, obs that contains the original column values, and obs_tally that contains the cumulative count:
>>> my_frame.inspect()
[#] obs obs_tally
===================
[0] 0 0.0
[1] 1 1.0
[2] 2 1.0
[3] 0 1.0
[4] 1 2.0
[5] 2 2.0
def tally(self, sample_col, count_val):
"""
Count number of times a value is seen.
Parameters
----------
:param sample_col: (str) The name of the column from which to compute the cumulative count.
:param count_val: (str) The column value to be used for the counts.
A cumulative count is computed by sequentially stepping through the rows,
observing the column values and keeping track of the number of times the specified
*count_value* has been seen.
Examples
--------
Consider Frame *my_frame*, which accesses a frame that contains a single
column named *obs*:
>>> my_frame.inspect()
[#] obs
========
[0] 0
[1] 1
[2] 2
[3] 0
[4] 1
[5] 2
The cumulative count for column *obs* is obtained by:
>>> my_frame.tally("obs", "1")
[===Job Progress===]
The Frame *my_frame* accesses the original frame that now contains two
columns, *obs* that contains the original column values, and
*obs_tally* that contains the cumulative count:
>>> my_frame.inspect()
[#] obs obs_tally
===================
[0] 0 0.0
[1] 1 1.0
[2] 2 1.0
[3] 0 1.0
[4] 1 2.0
[5] 2 2.0
"""
self._scala.tally(sample_col, count_val)
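The semantics of the new column can be reproduced locally: step through the rows and keep a running count of how many times count_val has appeared so far. A minimal sketch (illustrative only, not the library's implementation):

# Illustrative only: running count of how many times count_val has been seen so far.
def running_tally(values, count_val):
    seen = 0
    result = []
    for value in values:
        if value == count_val:
            seen += 1
        result.append(float(seen))
    return result

running_tally(["0", "1", "2", "0", "1", "2"], "1")   # [0.0, 1.0, 1.0, 1.0, 2.0, 2.0]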
def tally_percent(
self, sample_col, count_val)
Compute a cumulative percent count.
sample_col | (str): | The name of the column from which to compute the cumulative sum. |
count_val | (str): | The column value to be used for the counts. |
A cumulative percent count is computed by sequentially stepping through the rows, observing the column values and keeping track of the percentage of the total number of times the specified count_value has been seen up to the current value.
Consider Frame my_frame, which accesses a frame that contains a single column named obs:
>>> my_frame.inspect()
[#] obs
========
[0] 0
[1] 1
[2] 2
[3] 0
[4] 1
[5] 2
The cumulative percent count for column obs is obtained by:
>>> my_frame.tally_percent("obs", "1")
[===Job Progress===]
The Frame my_frame accesses the original frame that now contains two columns, obs that contains the original column values, and obs_tally_percent that contains the cumulative percent count:
>>> my_frame.inspect()
[#] obs obs_tally_percent
===========================
[0] 0 0.0
[1] 1 0.5
[2] 2 0.5
[3] 0 0.5
[4] 1 1.0
[5] 2 1.0
def tally_percent(self, sample_col, count_val):
"""
Compute a cumulative percent count.
Parameters
----------
:param sample_col: (str) The name of the column from which to compute the cumulative sum.
:param count_val: (str) The column value to be used for the counts.
A cumulative percent count is computed by sequentially stepping through
the rows, observing the column values and keeping track of the percentage of the
total number of times the specified *count_value* has been seen up to
the current value.
Examples
--------
Consider Frame *my_frame*, which accesses a frame that contains a single
column named *obs*:
>>> my_frame.inspect()
[#] obs
========
[0] 0
[1] 1
[2] 2
[3] 0
[4] 1
[5] 2
The cumulative percent count for column *obs* is obtained by:
>>> my_frame.tally_percent("obs", "1")
[===Job Progress===]
The Frame *my_frame* accesses the original frame that now contains two
columns, *obs* that contains the original column values, and
*obs_tally_percent* that contains the cumulative percent count:
>>> my_frame.inspect()
[#] obs obs_tally_percent
===========================
[0] 0 0.0
[1] 1 0.5
[2] 2 0.5
[3] 0 0.5
[4] 1 1.0
[5] 2 1.0
"""
self._scala.tallyPercent(sample_col, count_val)
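The percent variant divides the same running count by the total number of occurrences of count_val. A minimal local sketch (illustrative only):

# Illustrative only: running count of count_val divided by its total count.
def running_tally_percent(values, count_val):
    total = values.count(count_val)
    seen = 0
    result = []
    for value in values:
        if value == count_val:
            seen += 1
        result.append(float(seen) / total if total else 1.0)
    return result

running_tally_percent(["0", "1", "2", "0", "1", "2"], "1")   # [0.0, 0.5, 0.5, 0.5, 1.0, 1.0]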
def timeseries_augmented_dickey_fuller_test(
self, ts_column, max_lag, regression='c')
Performs the Augmented Dickey-Fuller (ADF) Test, which tests the null hypothesis that a unit root is present in a time series sample. The test statistic that is returned is a negative number. The lower the value, the stronger the rejection of the hypothesis that there is a unit root at some level of confidence.
ts_column | (str): | Name of the column that contains the time series values to use with the ADF test. |
max_lag | (int): | The lag order to calculate the test statistic. |
regression | (Optional(str)): | The method of regression that was used. Following MacKinnon's notation, this can be "c" for constant, "nc" for no constant, "ct" for constant and trend, and "ctt" for constant, trend, and trend-squared. |
Returns | (AugmentedDickeyFullerTestResult): | Object contains the ADF test statistic and p-value. |
Consider the following frame of time series values:
>>> frame.inspect()
[#] timeseries_values
======================
[0] 3.201
[1] 3.3178
[2] 3.6279
[3] 3.5902
[4] 3.43
[5] 4.0546
[6] 3.7606
[7] 3.1231
[8] 3.2077
[9] 4.3383
Calculate the augmented Dickey-Fuller test statistic by giving it the name of the column that has the time series values and the max_lag. The function returns an object that has properties for the p-value and test statistic.
>>> frame.timeseries_augmented_dickey_fuller_test("timeseries_values", 0)
p_value = 0.0
test_stat = -9.93422373369
def timeseries_augmented_dickey_fuller_test(self, ts_column, max_lag, regression = "c"):
"""
Performs the Augmented Dickey-Fuller (ADF) Test, which tests the null hypothesis that a unit root is present
in a time series sample. The test statistic that is returned is a negative number. The lower the value, the
stronger the rejection of the hypothesis that there is a unit root at some level of confidence.
Parameters
----------
:param ts_column: (str) Name of the column that contains the time series values to use with the ADF test.
:param max_lag: (int) The lag order to calculate the test statistic.
:param regression: (Optional(str)) The method of regression that was used. Following MacKinnon's notation, this
can be "c" for constant, "nc" for no constant, "ct" for constant and trend, and "ctt" for
constant, trend, and trend-squared.
:return: (AugmentedDickeyFullerTestResult) Object contains the ADF test statistic and p-value.
Example
-------
Consider the following frame of time series values:
>>> frame.inspect()
[#] timeseries_values
======================
[0] 3.201
[1] 3.3178
[2] 3.6279
[3] 3.5902
[4] 3.43
[5] 4.0546
[6] 3.7606
[7] 3.1231
[8] 3.2077
[9] 4.3383
Calculate the augmented Dickey-Fuller test statistic by giving it the name of the column that has the time series
values and the max_lag. The function returns an object that has properties for the p-value and test statistic.
>>> frame.timeseries_augmented_dickey_fuller_test("timeseries_values", 0)
p_value = 0.0
test_stat = -9.93422373369
"""
if not isinstance(ts_column, str):
raise TypeError("ts_column parameter should be a str")
if not isinstance(max_lag, int):
raise TypeError("max_lag parameter should be a int")
if not isinstance(regression, str):
raise TypeError("regression parameter should be a str")
scala_result = self._scala.timeSeriesAugmentedDickeyFullerTest(ts_column, max_lag, regression)
return AugmentedDickeyFullerTestResult(scala_result)
def timeseries_breusch_godfrey_test(
self, residuals, factors, max_lag)
Calculates the Breusch-Godfrey test statistic for serial correlation.
residuals | (str): | Name of the column that contains residual (y) values |
factors | (List[str]): | Name of the column(s) that contain factors (x) values |
max_lag | (int): | The lag order to calculate the test statistic. |
Returns | (BreuschGodfreyTestResult): | Object contains the Breusch-Godfrey test statistic and p-value. |
Consider the following frame that uses a snippet of air quality and sensor data from:
https://archive.ics.uci.edu/ml/datasets/Air+Quality.
Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.
>>> frame.inspect()
[#] Date Time CO_GT PT08_S1_CO NMHC_GT C6H6_GT Temp
====================================================================
[0] 10/03/2004 18.00.00 2.6 1360 150 11.9 13.6
[1] 10/03/2004 19.00.00 2.0 1292 112 9.4 13.3
[2] 10/03/2004 20.00.00 2.2 1402 88 9.0 11.9
[3] 10/03/2004 21.00.00 2.2 1376 80 9.2 11.0
[4] 10/03/2004 22.00.00 1.6 1272 51 6.5 11.2
[5] 10/03/2004 23.00.00 1.2 1197 38 4.7 11.2
[6] 11/03/2004 00.00.00 1.2 1185 31 3.6 11.3
[7] 11/03/2004 01.00.00 1.0 1136 31 3.3 10.7
[8] 11/03/2004 02.00.00 0.9 1094 24 2.3 10.7
[9] 11/03/2004 03.00.00 0.6 1010 19 1.7 10.3
Calculate the Breusch-Godfrey test result:
>>> y_column = "Temp"
>>> x_columns = ['CO_GT', 'PT08_S1_CO', 'NMHC_GT', 'C6H6_GT']
>>> max_lag = 1
>>> result = frame.timeseries_breusch_godfrey_test(y_column, x_columns, max_lag)
>>> result
p_value = 0.00353847462468
test_stat = 8.50666768455
def timeseries_breusch_godfrey_test(self, residuals, factors, max_lag):
"""
Calculates the Breusch-Godfrey test statistic for serial correlation.
Parameters
----------
:param residuals: (str) Name of the column that contains residual (y) values
:param factors: (List[str]) Name of the column(s) that contain factors (x) values
:param max_lag: (int) The lag order to calculate the test statistic.
:return: (BreuschGodfreyTestResult) Object contains the Breusch-Godfrey test statistic and p-value.
Example
-------
Consider the following frame that uses a snippet of air quality and sensor data from:
https://archive.ics.uci.edu/ml/datasets/Air+Quality.
Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml].
Irvine, CA: University of California, School of Information and Computer Science.
>>> frame.inspect()
[#] Date Time CO_GT PT08_S1_CO NMHC_GT C6H6_GT Temp
====================================================================
[0] 10/03/2004 18.00.00 2.6 1360 150 11.9 13.6
[1] 10/03/2004 19.00.00 2.0 1292 112 9.4 13.3
[2] 10/03/2004 20.00.00 2.2 1402 88 9.0 11.9
[3] 10/03/2004 21.00.00 2.2 1376 80 9.2 11.0
[4] 10/03/2004 22.00.00 1.6 1272 51 6.5 11.2
[5] 10/03/2004 23.00.00 1.2 1197 38 4.7 11.2
[6] 11/03/2004 00.00.00 1.2 1185 31 3.6 11.3
[7] 11/03/2004 01.00.00 1.0 1136 31 3.3 10.7
[8] 11/03/2004 02.00.00 0.9 1094 24 2.3 10.7
[9] 11/03/2004 03.00.00 0.6 1010 19 1.7 10.3
Calculate the Breusch-Godfrey test result:
>>> y_column = "Temp"
>>> x_columns = ['CO_GT', 'PT08_S1_CO', 'NMHC_GT', 'C6H6_GT']
>>> max_lag = 1
>>> result = frame.timeseries_breusch_godfrey_test(y_column, x_columns, max_lag)
>>> result
p_value = 0.00353847462468
test_stat = 8.50666768455
"""
if not isinstance(residuals, str):
raise TypeError("residuals parameter should be a str (column name).")
if isinstance(factors, str):
factors = [factors]
if not isinstance(factors, list):
raise TypeError("factors parameter should be a list of strings (column names).")
if not isinstance(max_lag, int):
raise TypeError("max_lag parameter should be an integer.")
scala_result = self._scala.timeSeriesBreuschGodfreyTest(residuals,
self._tc.jutils.convert.to_scala_list_string(factors),
max_lag)
return BreuschGodfreyTestResult(scala_result)
def timeseries_breusch_pagan_test(
self, residuals, factors)
Performs the Breusch-Pagan test for heteroskedasticity.
residuals | (str): | Name of the column that contains residual (y) values |
factors | (List[str]): | Name of the column(s) that contain factors (x) values |
Returns | (BreuschPaganTestResult): | Object contains the Breusch-Pagan test statistic and p-value. |
Consider the following frame:
>>> frame.inspect()
[#] AT V AP RH PE
=========================================
[0] 8.34 40.77 1010.84 90.01 480.48
[1] 23.64 58.49 1011.4 74.2 445.75
[2] 29.74 56.9 1007.15 41.91 438.76
[3] 19.07 49.69 1007.22 76.79 453.09
[4] 11.8 40.66 1017.13 97.2 464.43
[5] 13.97 39.16 1016.05 84.6 470.96
[6] 22.1 71.29 1008.2 75.38 442.35
[7] 14.47 41.76 1021.98 78.41 464
[8] 31.25 69.51 1010.25 36.83 428.77
[9] 6.77 38.18 1017.8 81.13 484.3
Calculate the Breusch-Pagan test statistic where the "AT" column contains residual values and the other columns are factors:
>>> result = frame.timeseries_breusch_pagan_test("AT",["V","AP","RH","PE"])
[===Job Progress===]
The result contains the test statistic and p-value:
>>> result
p_value = 0.000147089380721
test_stat = 22.6741588802
def timeseries_breusch_pagan_test(self, residuals, factors):
"""
Performs the Breusch-Pagan test for heteroskedasticity.
Parameters
----------
:param residuals: (str) Name of the column that contains residual (y) values
:param factors: (List[str]) Name of the column(s) that contain factors (x) values
:return: (BreuschPaganTestResult) Object contains the Breusch-Pagan test statistic and p-value.
Example
-------
Consider the following frame:
>>> frame.inspect()
[#] AT V AP RH PE
=========================================
[0] 8.34 40.77 1010.84 90.01 480.48
[1] 23.64 58.49 1011.4 74.2 445.75
[2] 29.74 56.9 1007.15 41.91 438.76
[3] 19.07 49.69 1007.22 76.79 453.09
[4] 11.8 40.66 1017.13 97.2 464.43
[5] 13.97 39.16 1016.05 84.6 470.96
[6] 22.1 71.29 1008.2 75.38 442.35
[7] 14.47 41.76 1021.98 78.41 464
[8] 31.25 69.51 1010.25 36.83 428.77
[9] 6.77 38.18 1017.8 81.13 484.3
Calculate the Breusch-Pagan test statistic where the "AT" column contains residual values and the other columns are
factors:
>>> result = frame.timeseries_breusch_pagan_test("AT",["V","AP","RH","PE"])
[===Job Progress===]
The result contains the test statistic and p-value:
>>> result
p_value = 0.000147089380721
test_stat = 22.6741588802
"""
if not isinstance(residuals, str):
raise TypeError("residuals parameter should be a str (column name).")
if isinstance(factors, str):
factors = [factors]
if not isinstance(factors, list):
raise TypeError("factors parameter should be a list of strings (column names).")
scala_result = self._scala.timeSeriesBreuschPaganTest(residuals,
self._tc.jutils.convert.to_scala_list_string(factors))
return BreuschPaganTestResult(scala_result)
def timeseries_durbin_watson_test(
self, residuals)
Computes the Durbin-Watson test statistic used to determine the presence of serial correlation in the residuals. Serial correlation can show a relationship between values separated from each other by a given time lag. A value close to 0.0 gives evidence for positive serial correlation, a value close to 4.0 gives evidence for negative serial correlation, and a value close to 2.0 gives evidence for no serial correlation.
residuals | (str): | Name of the column that contains residual values |
Returns: | The Durbin-Watson test statistic |
In this example, we have a frame that contains time series values. The inspect command below shows a snippet of what the data looks like:
>>> frame.inspect()
[#] timeseries_values
======================
[0] 3.201
[1] 3.3178
[2] 3.6279
[3] 3.5902
[4] 3.43
[5] 4.0546
[6] 3.7606
[7] 3.1231
[8] 3.2077
[9] 4.3383
Calculate the Durbin-Watson test statistic by giving it the name of the column that has the time series values:
>>> frame.timeseries_durbin_watson_test("timeseries_values")
0.02678674777710402
def timeseries_durbin_watson_test(self, residuals):
"""
Computes the Durbin-Watson test statistic used to determine the presence of serial correlation in the residuals.
Serial correlation can show a relationship between values separated from each other by a given time lag. A value
close to 0.0 gives evidence for positive serial correlation, a value close to 4.0 gives evidence for negative
serial correlation, and a value close to 2.0 gives evidence for no serial correlation.
:param residuals: (str) Name of the column that contains residual values
:return: The Durbin-Watson test statistic
Example
-------
In this example, we have a frame that contains time series values. The inspect command below shows a snippet of
what the data looks like:
>>> frame.inspect()
[#] timeseries_values
======================
[0] 3.201
[1] 3.3178
[2] 3.6279
[3] 3.5902
[4] 3.43
[5] 4.0546
[6] 3.7606
[7] 3.1231
[8] 3.2077
[9] 4.3383
Calculate the Durbin-Watson test statistic by giving it the name of the column that has the time series values:
>>> frame.timeseries_durbin_watson_test("timeseries_values")
0.02678674777710402
"""
if not isinstance(residuals, str):
raise TypeError("residuals should be a str (column name).")
return self._scala.timeSeriesDurbinWatsonTest(residuals)
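For reference, the Durbin-Watson statistic over residuals e_1..e_n is sum over t=2..n of (e_t - e_{t-1})^2 divided by the sum over t=1..n of e_t^2. A small pure-Python sketch of that formula (not the Scala implementation, toy residuals only):

# Illustrative only: Durbin-Watson statistic for a list of residuals.
def durbin_watson(residuals):
    numerator = sum((residuals[t] - residuals[t - 1]) ** 2 for t in range(1, len(residuals)))
    denominator = sum(e ** 2 for e in residuals)
    return numerator / denominator

durbin_watson([0.5, -0.3, 0.2, -0.1])   # ~2.51 here; values near 2.0 indicate little serial correlation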
def timeseries_from_observations(
self, date_time_index, timestamp_column, key_column, value_column)
Returns a frame that has the observations formatted as a time series.
date_time_index: | List of date/time strings. DateTimeIndex to conform all series to. |
timestamp_column: | The name of the column telling when the observation occurred. |
key_column: | The name of the column that contains which string key the observation belongs to. |
value_column: | The name of the column that contains the observed value. |
Returns: | Frame formatted as a time series (with a column for key and a column for the vector of values). |
Uses the specified timestamp, key, and value columns and the date/time index provided to format the observations as a time series. The time series frame will have columns for the key and a vector of the observed values that correspond to the date/time index.
In this example, we will use a frame of observations of resting heart rate for three individuals over three days. The data is accessed from a Frame object called my_frame:
>>> my_frame.inspect(my_frame.count())
[#] name date resting_heart_rate
======================================================
[0] Edward 2016-01-01T12:00:00Z 62
[1] Stanley 2016-01-01T12:00:00Z 57
[2] Edward 2016-01-02T12:00:00Z 63
[3] Sarah 2016-01-02T12:00:00Z 64
[4] Stanley 2016-01-02T12:00:00Z 57
[5] Edward 2016-01-03T12:00:00Z 62
[6] Sarah 2016-01-03T12:00:00Z 64
[7] Stanley 2016-01-03T12:00:00Z 56
We then need to create an array that contains the date/time index, which will be used when creating the time series. Since our data is for three days, our date/time index will just contain those three dates:
>>> datetimeindex = ["2016-01-01T12:00:00.000Z","2016-01-02T12:00:00.000Z","2016-01-03T12:00:00.000Z"]
Then we can create our time series frame by specifying our date/time index along with the name of our timestamp column (in this example, it's "date"), key column (in this example, it's "name"), and value column (in this example, it's "resting_heart_rate").
>>> ts = my_frame.timeseries_from_observations(datetimeindex, "date", "name", "resting_heart_rate")
[===Job Progress===]
Take a look at the resulting time series frame schema and contents:
>>> ts.schema
[(u'name', <type 'unicode'>), (u'resting_heart_rate', vector(3))]
>>> ts.inspect()
[#] name resting_heart_rate
================================
[0] Stanley [57.0, 57.0, 56.0]
[1] Edward [62.0, 63.0, 62.0]
[2] Sarah [None, 64.0, 64.0]
def timeseries_from_observations(self, date_time_index, timestamp_column, key_column, value_column):
"""
Returns a frame that has the observations formatted as a time series.
:param date_time_index: List of date/time strings. DateTimeIndex to conform all series to.
:param timestamp_column: The name of the column telling when the observation occurred.
:param key_column: The name of the column that contains which string key the observation belongs to.
:param value_column: The name of the column that contains the observed value.
:return: Frame formatted as a time series (with a column for key and a column for the vector of values).
Uses the specified timestamp, key, and value columns and the date/time index provided to format the observations
as a time series. The time series frame will have columns for the key and a vector of the observed values that
correspond to the date/time index.
Examples
--------
In this example, we will use a frame of observations of resting heart rate for three individuals over three days.
The data is accessed from a Frame object called *my_frame*:
>>> my_frame.inspect(my_frame.count())
[#] name date resting_heart_rate
======================================================
[0] Edward 2016-01-01T12:00:00Z 62
[1] Stanley 2016-01-01T12:00:00Z 57
[2] Edward 2016-01-02T12:00:00Z 63
[3] Sarah 2016-01-02T12:00:00Z 64
[4] Stanley 2016-01-02T12:00:00Z 57
[5] Edward 2016-01-03T12:00:00Z 62
[6] Sarah 2016-01-03T12:00:00Z 64
[7] Stanley 2016-01-03T12:00:00Z 56
We then need to create an array that contains the date/time index,
which will be used when creating the time series. Since our data
is for three days, our date/time index will just contain those
three dates:
>>> datetimeindex = ["2016-01-01T12:00:00.000Z","2016-01-02T12:00:00.000Z","2016-01-03T12:00:00.000Z"]
Then we can create our time series frame by specifying our date/time
index along with the name of our timestamp column (in this example, it's
"date"), key column (in this example, it's "name"), and value column (in
this example, it's "resting_heart_rate").
>>> ts = my_frame.timeseries_from_observations(datetimeindex, "date", "name", "resting_heart_rate")
[===Job Progress===]
Take a look at the resulting time series frame schema and contents:
>>> ts.schema
[(u'name', <type 'unicode'>), (u'resting_heart_rate', vector(3))]
>>> ts.inspect()
[#] name resting_heart_rate
================================
[0] Stanley [57.0, 57.0, 56.0]
[1] Edward [62.0, 63.0, 62.0]
[2] Sarah [None, 64.0, 64.0]
"""
if not isinstance(date_time_index, list):
raise TypeError("date_time_index should be a list of date/times")
scala_date_list = self._tc.jutils.convert.to_scala_date_time_list(date_time_index)
from sparktk.frame.frame import Frame
return Frame(self._tc,
self._scala.timeSeriesFromObseravations(scala_date_list, timestamp_column, key_column, value_column))
def timeseries_slice(
self, date_time_index, start, end)
Returns a frame split on the specified start and end date/times.
Splits a time series frame on the specified start and end date/times.
date_time_index: | List of date/time strings. DateTimeIndex to conform all series to. |
start: | The start date for the slice in the ISO 8601 format, like: yyyy-MM-dd'T'HH:mm:ss.SSSZ |
end: | The end date for the slice in the ISO 8601 format, like: yyyy-MM-dd'T'HH:mm:ss.SSSZ |
Returns: | Frame that contains a sub-slice of the current frame, based on the specified start/end date/times. |
For this example, we start with a frame that has already been formatted as a time series. This means that the frame has a string column for key and a vector column that contains a series of the observed values. We must also know the date/time index that corresponds to the time series.
The time series is in a Frame object called ts_frame.
>>> ts_frame.inspect()
[#] key series
==================================
[0] A [62, 55, 60, 61, 60, 59]
[1] B [60, 58, 61, 62, 60, 61]
[2] C [69, 68, 68, 70, 71, 69]
Next, we define the date/time index. In this example, it is one day intervals from 2016-01-01 to 2016-01-06:
>>> datetimeindex = ["2016-01-01T12:00:00.000Z","2016-01-02T12:00:00.000Z","2016-01-03T12:00:00.000Z","2016-01-04T12:00:00.000Z","2016-01-05T12:00:00.000Z","2016-01-06T12:00:00.000Z"]
Get a slice of our time series from 2016-01-02 to 2016-01-04:
>>> slice_start = "2016-01-02T12:00:00.000Z"
>>> slice_end = "2016-01-04T12:00:00.000Z"
>>> sliced_frame = ts_frame.timeseries_slice(datetimeindex, slice_start, slice_end)
[===Job Progress===]
Take a look at our sliced time series:
>>> sliced_frame.inspect()
[#] key series
============================
[0] A [55.0, 60.0, 61.0]
[1] B [58.0, 61.0, 62.0]
[2] C [68.0, 68.0, 70.0]
def timeseries_slice(self, date_time_index, start, end):
"""
Returns a frame split on the specified start and end date/times.
Splits a time series frame on the specified start and end date/times.
:param date_time_index: List of date/time strings. DateTimeIndex to conform all series to.
:param start: The start date for the slice in the ISO 8601 format, like: yyyy-MM-dd'T'HH:mm:ss.SSSZ
:param end: The end date for the slice in the ISO 8601 format, like: yyyy-MM-dd'T'HH:mm:ss.SSSZ
:return: Frame that contains a sub-slice of the current frame, based on the specified start/end date/times.
Examples
--------
For this example, we start with a frame that has already been formatted as a time series.
This means that the frame has a string column for key and a vector column that contains
a series of the observed values. We must also know the date/time index that corresponds
to the time series.
The time series is in a Frame object called *ts_frame*.
>>> ts_frame.inspect()
[#] key series
==================================
[0] A [62, 55, 60, 61, 60, 59]
[1] B [60, 58, 61, 62, 60, 61]
[2] C [69, 68, 68, 70, 71, 69]
Next, we define the date/time index. In this example, it is one day intervals from
2016-01-01 to 2016-01-06:
>>> datetimeindex = ["2016-01-01T12:00:00.000Z","2016-01-02T12:00:00.000Z","2016-01-03T12:00:00.000Z","2016-01-04T12:00:00.000Z","2016-01-05T12:00:00.000Z","2016-01-06T12:00:00.000Z"]
Get a slice of our time series from 2016-01-02 to 2016-01-04:
>>> slice_start = "2016-01-02T12:00:00.000Z"
>>> slice_end = "2016-01-04T12:00:00.000Z"
>>> sliced_frame = ts_frame.timeseries_slice(datetimeindex, slice_start, slice_end)
[===Job Progress===]
Take a look at our sliced time series:
>>> sliced_frame.inspect()
[#] key series
============================
[0] A [55.0, 60.0, 61.0]
[1] B [58.0, 61.0, 62.0]
[2] C [68.0, 68.0, 70.0]
"""
if not isinstance(date_time_index, list):
raise TypeError("date_time_index should be a list of date/times")
if not isinstance(start, basestring):
raise TypeError("start date/time should be a string in the ISO 8601 format")
if not isinstance(end, basestring):
raise TypeError("end date/time should be a string in the ISO 8601 format")
from sparktk.frame.frame import Frame
return Frame(self._tc,
self._scala.timeSeriesSlice(self._tc.jutils.convert.to_scala_date_time_list(date_time_index),
self._tc.jutils.convert.to_scala_date_time(start),
self._tc.jutils.convert.to_scala_date_time(end)))
def to_pandas(
self, n=None, offset=0, columns=None)
Brings data into a local pandas dataframe.
Similar to the 'take' function, but puts the data into a pandas dataframe.
n | (Optional(int)): | The number of rows to get from the frame (warning: do not overwhelm the python session by taking too much) |
offset | (Optional(int)): | The number of rows to skip before copying. Defaults to 0. |
columns | (Optional(List[str])): | Column filter. The list of names to be included. Default is all columns. |
Returns | (pandas.DataFrame): | A new pandas dataframe object containing the taken frame data. |
Consider the following spark-tk frame, where we have columns for name and phone number:
>>> frame.inspect()
[#] name phone
=======================
[0] Fred 555-1234
[1] Susan 555-0202
[2] Thurston 555-4510
[3] Judy 555-2183
>>> frame.schema
[('name', <type 'str'>), ('phone', <type 'str'>)]
The frame to_pandas() method is used to get a pandas DataFrame that contains the data from the spark-tk frame. Note that since no parameters are provided when to_pandas() is called, the default values are used for the number of rows, the row offset, and the columns.
>>> pandas_frame = frame.to_pandas()
>>> pandas_frame
name phone
0 Fred 555-1234
1 Susan 555-0202
2 Thurston 555-4510
3 Judy 555-2183
def to_pandas(self, n=None, offset=0, columns=None):
"""
Brings data into a local pandas dataframe.
Similar to the 'take' function, but puts the data into a pandas dataframe.
Parameters
----------
:param n: (Optional(int)) The number of rows to get from the frame (warning: do not overwhelm the python session
by taking too much)
:param offset: (Optional(int)) The number of rows to skip before copying. Defaults to 0.
:param columns: (Optional(List[str])) Column filter. The list of names to be included. Default is all columns.
:return: (pandas.DataFrame) A new pandas dataframe object containing the taken frame data.
Examples
--------
Consider the following spark-tk frame, where we have columns for name and phone number:
>>> frame.inspect()
[#] name phone
=======================
[0] Fred 555-1234
[1] Susan 555-0202
[2] Thurston 555-4510
[3] Judy 555-2183
>>> frame.schema
[('name', <type 'str'>), ('phone', <type 'str'>)]
The frame to_pandas() method is used to get a pandas DataFrame that contains the data from the spark-tk frame. Note
that since no parameters are provided when to_pandas() is called, the default values are used for the number of
rows, the row offset, and the columns.
>>> pandas_frame = frame.to_pandas()
>>> pandas_frame
name phone
0 Fred 555-1234
1 Susan 555-0202
2 Thurston 555-4510
3 Judy 555-2183
"""
try:
import pandas
except:
raise RuntimeError("pandas module not found, unable to download. Install pandas or try the take command.")
from sparktk.frame.ops.take import take_rich
result = take_rich(self, n, offset, columns)
headers, data_types = zip(*result.schema)
frame_data = result.data
from sparktk import dtypes
import datetime
date_time_columns = [i for i, x in enumerate(self.schema) if x[1] in (dtypes.datetime, datetime.datetime)]
has_date_time = len(date_time_columns) > 0
# translate our datetime long to datetime, so that it gets into the pandas df as a datetime column
def long_to_date_time(row):
for i in date_time_columns:
if isinstance(row[i], long):
row[i] = datetime.datetime.fromtimestamp(row[i]//1000).replace(microsecond=row[i]%1000*1000)
return row
if (has_date_time):
frame_data = map(long_to_date_time, frame_data)
# create pandas df
pandas_df = pandas.DataFrame(frame_data, columns=headers)
for i, dtype in enumerate(data_types):
dtype_str = _sparktk_dtype_to_pandas_str(dtype)
try:
pandas_df[[headers[i]]] = pandas_df[[headers[i]]].astype(dtype_str)
except (TypeError, ValueError):
if dtype_str.startswith("int"):
# DataFrame does not handle missing values in int columns. If we get this error, use the 'object' datatype instead.
print "WARNING - Encountered problem casting column %s to %s, possibly due to missing values (i.e. presence of None). Continued by casting column %s as 'object'" % (headers[i], dtype_str, headers[i])
pandas_df[[headers[i]]] = pandas_df[[headers[i]]].astype("object")
else:
raise
return pandas_df
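Building on the same frame, a hedged usage sketch of the optional parameters (limiting the number of rows and the columns returned; the variable name is illustrative):

>>> names_df = frame.to_pandas(n=2, columns=['name'])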
def top_k(
self, column_name, k, weight_column=None)
Most or least frequent column values.
column_name | (str): | The column whose top (or bottom) K distinct values are to be calculated. |
k | (int): | Number of entries to return (If k is negative, return bottom k). |
weight_column | (Optional[str]): | The column that provides weights (frequencies) for the topK calculation. Must contain numerical data. Default is 1 for all items. |
Calculate the top (or bottom) K distinct values by count of a column. The column can be weighted. All data elements of weight <= 0 are excluded from the calculation, as are all data elements whose weight is NaN or infinite. If there are no data elements of finite weight > 0, then topK is empty.
For this example, we calculate the top 2 counties in a data frame:
Consider the following frame:
>>> frame.inspect(frame.count())
[##] rank city population_2013 population_2010 change county
=============================================================================
[0] 1 Portland 609456 583776 4.40% Multnomah
[1] 2 Salem 160614 154637 3.87% Marion
[2] 3 Eugene 159190 156185 1.92% Lane
[3] 4 Gresham 109397 105594 3.60% Multnomah
[4] 5 Hillsboro 97368 91611 6.28% Washington
[5] 6 Beaverton 93542 89803 4.16% Washington
[6] 15 Grants Pass 35076 34533 1.57% Josephine
[7] 16 Oregon City 34622 31859 8.67% Clackamas
[8] 17 McMinnville 33131 32187 2.93% Yamhill
[9] 18 Redmond 27427 26215 4.62% Deschutes
[10] 19 Tualatin 26879 26054 4.17% Washington
[11] 20 West Linn 25992 25109 3.52% Clackamas
[12] 7 Bend 81236 76639 6.00% Deschutes
[13] 8 Medford 77677 74907 3.70% Jackson
[14] 9 Springfield 60177 59403 1.30% Lane
[15] 10 Corvallis 55298 54462 1.54% Benton
[16] 11 Albany 51583 50158 2.84% Linn
[17] 12 Tigard 50444 48035 5.02% Washington
[18] 13 Lake Oswego 37610 36619 2.71% Clackamas
[19] 14 Keizer 37064 36478 1.61% Marion
>>> top_frame = frame.top_k("county", 2)
[===Job Progress===]
>>> top_frame.inspect()
[#] county count
======================
[0] Washington 4.0
[1] Clackamas 3.0
def top_k(self, column_name, k, weight_column=None):
"""
Most or least frequent column values.
Parameters
----------
:param column_name: (str) The column whose top (or bottom) K distinct values are to be calculated.
:param k: (int) Number of entries to return (If k is negative, return bottom k).
:param weight_column: (Optional[str]) The column that provides weights (frequencies) for the topK calculation.
Must contain numerical data. Default is 1 for all items.
Calculate the top (or bottom) K distinct values by count of a column. The column can be
weighted. All data elements of weight <= 0 are excluded from the calculation, as are
all data elements whose weight is NaN or infinite. If there are no data elements of
finite weight > 0, then topK is empty.
Examples
--------
For this example, we calculate the top 2 counties in a data frame:
Consider the following frame:
>>> frame.inspect(frame.count())
[##] rank city population_2013 population_2010 change county
=============================================================================
[0] 1 Portland 609456 583776 4.40% Multnomah
[1] 2 Salem 160614 154637 3.87% Marion
[2] 3 Eugene 159190 156185 1.92% Lane
[3] 4 Gresham 109397 105594 3.60% Multnomah
[4] 5 Hillsboro 97368 91611 6.28% Washington
[5] 6 Beaverton 93542 89803 4.16% Washington
[6] 15 Grants Pass 35076 34533 1.57% Josephine
[7] 16 Oregon City 34622 31859 8.67% Clackamas
[8] 17 McMinnville 33131 32187 2.93% Yamhill
[9] 18 Redmond 27427 26215 4.62% Deschutes
[10] 19 Tualatin 26879 26054 4.17% Washington
[11] 20 West Linn 25992 25109 3.52% Clackamas
[12] 7 Bend 81236 76639 6.00% Deschutes
[13] 8 Medford 77677 74907 3.70% Jackson
[14] 9 Springfield 60177 59403 1.30% Lane
[15] 10 Corvallis 55298 54462 1.54% Benton
[16] 11 Albany 51583 50158 2.84% Linn
[17] 12 Tigard 50444 48035 5.02% Washington
[18] 13 Lake Oswego 37610 36619 2.71% Clackamas
[19] 14 Keizer 37064 36478 1.61% Marion
>>> top_frame = frame.top_k("county", 2)
[===Job Progress===]
>>> top_frame.inspect()
[#] county count
======================
[0] Washington 4.0
[1] Clackamas 3.0
"""
from sparktk.frame.frame import Frame
return Frame(self._tc, self._scala.topK(column_name, k, self._tc.jutils.convert.to_scala_option(weight_column)))
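For the unweighted case, the result is equivalent to counting distinct values and keeping the k most common. An illustrative local sketch with collections.Counter over the same county data (not the library's implementation):

from collections import Counter

counties = ["Multnomah", "Marion", "Lane", "Multnomah", "Washington", "Washington",
            "Josephine", "Clackamas", "Yamhill", "Deschutes", "Washington", "Clackamas",
            "Deschutes", "Jackson", "Lane", "Benton", "Linn", "Washington", "Clackamas", "Marion"]
Counter(counties).most_common(2)   # [('Washington', 4), ('Clackamas', 3)]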
def unflatten_columns(
self, columns, delimiter=',')
Compacts data from multiple rows based on cell data.
columns | (str or List[str]): | Name of the column(s) to be used as keys for unflattening. |
delimiter | (Optional[str]): | Separator for the data in the result columns. Default is comma (,). |
Groups together cells in all columns (less the composite key) using "," as string delimiter. The original rows are deleted. The grouping takes place based on a composite key created from cell values. The column datatypes are changed to string.
Given a data file::
user1 1/1/2015 1 70
user1 1/1/2015 2 60
user2 1/1/2015 1 65
The commands to bring the data into a frame, where it can be worked on:
>>> frame.inspect()
[#] a b c d
===========================
[0] user1 1/1/2015 1 70
[1] user1 1/1/2015 2 60
[2] user2 1/1/2015 1 65
Unflatten the data using columns a & b:
>>> frame.unflatten_columns(['a','b'])
[===Job Progress===]
Check again:
>>> frame.inspect()
[#] a b c d
================================
[0] user1 1/1/2015 1,2 70,60
[1] user2 1/1/2015 1 65
Alternatively, unflatten_columns() also accepts a single column like:
>>> frame.unflatten_columns('a')
[===Job Progress===]
>>> frame.inspect()
[#] a b c d
=========================================
[0] user1 1/1/2015,1/1/2015 1,2 70,60
[1] user2 1/1/2015 1 65
def unflatten_columns(self, columns, delimiter=","):
"""
Compacts data from multiple rows based on cell data.
Parameters
----------
:param columns: (str or List[str]) Name of the column(s) to be used as keys for unflattening.
:param delimiter: (Optional[str]) Separator for the data in the result columns. Default is comma (,).
Groups together cells in all columns (less the composite key) using "," as string delimiter.
The original rows are deleted.
The grouping takes place based on a composite key created from cell values.
The column datatypes are changed to string.
Examples
--------
Given a data file::
user1 1/1/2015 1 70
user1 1/1/2015 2 60
user2 1/1/2015 1 65
The commands to bring the data into a frame, where it can be worked on:
>>> frame.inspect()
[#] a b c d
===========================
[0] user1 1/1/2015 1 70
[1] user1 1/1/2015 2 60
[2] user2 1/1/2015 1 65
Unflatten the data using columns a & b:
>>> frame.unflatten_columns(['a','b'])
[===Job Progress===]
Check again:
>>> frame.inspect()
[#] a b c d
================================
[0] user1 1/1/2015 1,2 70,60
[1] user2 1/1/2015 1 65
Alternatively, unflatten_columns() also accepts a single column like:
>>> frame.unflatten_columns('a')
[===Job Progress===]
>>> frame.inspect()
[#] a b c d
=========================================
[0] user1 1/1/2015,1/1/2015 1,2 70,60
[1] user2 1/1/2015 1 65
"""
if not isinstance(columns, list):
columns = [columns]
return self._scala.unflattenColumns(self._tc.jutils.convert.to_scala_list_string(columns),
delimiter)
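Locally, the compaction groups rows by the composite key and joins the remaining cells with the delimiter. A rough pure-Python sketch of that grouping (illustrative only, not the Scala implementation):

from collections import OrderedDict

# Illustrative only: group rows by a composite key and join the other cells with a delimiter.
def unflatten(rows, key_indices, delimiter=","):
    groups = OrderedDict()
    for row in rows:
        key = tuple(row[i] for i in key_indices)
        groups.setdefault(key, []).append([cell for i, cell in enumerate(row) if i not in key_indices])
    result = []
    for key, grouped in groups.items():
        joined = [delimiter.join(str(cell) for cell in cells) for cells in zip(*grouped)]
        result.append(list(key) + joined)
    return result

rows = [["user1", "1/1/2015", 1, 70], ["user1", "1/1/2015", 2, 60], ["user2", "1/1/2015", 1, 65]]
unflatten(rows, [0, 1])
# [['user1', '1/1/2015', '1,2', '70,60'], ['user2', '1/1/2015', '1', '65']]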
def validate_pyrdd_schema(
self, pyrdd, schema)
def validate_pyrdd_schema(self, pyrdd, schema):
if isinstance(pyrdd, RDD):
schema_length = len(schema)
num_bad_values = self._tc.sc.accumulator(0)
def validate_schema(row, accumulator):
data = []
if len(row) != schema_length:
raise ValueError("Length of the row (%s) does not match the schema length (%s)." % (len(row), len(schema)))
for index, column in enumerate(schema):
data_type = column[1]
try:
if row[index] is not None:
data.append(dtypes.dtypes.cast(row[index], data_type))
except:
data.append(None)
accumulator += 1
return data
validated_rdd = pyrdd.map(lambda row: validate_schema(row, num_bad_values))
# Force rdd to load, so that we can get a bad value count
validated_rdd.count()
return SchemaValidationReturn(validated_rdd, num_bad_values.value)
else:
raise TypeError("Unable to validate schema, because the pyrdd provided is not an RDD.")