sparktk.frame.ops.categorical_summary module

Show source ≡

# vim: set encoding=utf-8

#  Copyright (c) 2016 Intel Corporation 
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

from sparktk.propobj import PropertiesObject
from sparktk.frame.ops.inspect import RowsInspection


class CategoricalSummaryOutputList(list):
    """
    List whose text form is the string form of each of its items, separated by a blank line.
    """
    def __str__(self):
        rendered_items = (str(entry) for entry in self)
        return "\n\n".join(rendered_items)

    def __repr__(self):
        # repr matches str, so echoing the list at an interactive prompt shows the same text as print
        return str(self)


class CategoricalSummaryOutput(PropertiesObject):
    """
    Categorical summary of one column: the column name plus its levels, each with a frequency and a percentage.
    """
    def __init__(self, scala_result):
        """
        :param scala_result: object exposing column() and levels(); every item of levels() is wrapped in a LevelData
        """
        self._column_name = scala_result.column()
        scala_levels = list(scala_result.levels())
        self._levels = [LevelData(scala_level) for scala_level in scala_levels]

    @property
    def column_name(self):
        """Value returned by the scala result's column()."""
        return self._column_name

    @property
    def levels(self):
        """List of LevelData objects, one per level of the scala result."""
        return self._levels

    def __str__(self):
        # One table row per level: [level, frequency, percentage]
        table_rows = [[entry.level, entry.frequency, entry.percentage] for entry in self.levels]
        table_schema = [("level", str), ("frequency", int), ("percentage", float)]
        table = RowsInspection(table_rows, table_schema, 0)

        return "column_name = \"{0}\"\n{1}".format(self.column_name, table)

    def __repr__(self):
        # repr matches str, so the table is shown when the object is echoed at a prompt
        return str(self)


class LevelData(PropertiesObject):
    """
    Read-only holder for one level entry of a categorical summary: the level value together with the
    frequency and percentage reported for it.
    """
    def __init__(self, scala_result):
        """
        :param scala_result: object exposing level(), frequency() and percentage() methods
        """
        # Read the three values once, up front; the properties below return the stored copies
        self._level = scala_result.level()
        self._frequency = scala_result.frequency()
        self._percentage = scala_result.percentage()

    @property
    def level(self):
        """Value returned by the scala result's level()."""
        return self._level

    @property
    def frequency(self):
        """Value returned by the scala result's frequency()."""
        return self._frequency

    @property
    def percentage(self):
        """Value returned by the scala result's percentage()."""
        return self._percentage


def categorical_summary(self, columns, top_k=None, threshold=None):
    """
    Build a categorical summary of one or more columns of the frame.

    Parameters
    ----------

    :param columns: (str or List[str]) Name of the column, or list of names of the columns, to summarize.
    :param top_k: (Optional[int or List[Optional[int]]]) Displays levels which are in the top k most frequently
            occurring values for that column.  Either a single value or a list of values may be given;
            a list may contain None entries (see the last example below).
            Default is 10.
    :param threshold: (Optional[float or List[Optional[float]]]) Displays levels which are above the threshold
            percentage with respect to the total row count.  Either a single value or a list of values
            may be given; a list may contain None entries (see the last example below).
            Default is 0.0.
    :return: (CategoricalSummaryOutputList) List of CategoricalSummaryOutput objects for the specified column(s),
             consisting of levels with their frequency and percentage.

    Compute a summary of the data in one or more columns of categorical or numerical data type.
    The returned value is a list containing a categorical summary for each specified column.

    For each column, levels which satisfy the top k and/or threshold cutoffs are
    displayed along with their frequency and percentage occurrence with respect to
    the total rows in the dataset.

    Performs level pruning first based on top k and then keeps only
    the levels which satisfy the threshold criterion.

    Missing data is reported when a column value is empty ("") or null.

    All remaining data is grouped together in the Other category and its frequency
    and percentage are reported as well.

    User must specify the column name and can optionally specify top_k and/or threshold.

    Examples
    --------

    Consider Frame *my_frame*, which contains the data


        >>> my_frame.inspect()
        [#]  source           target
        =====================================
        [0]  entity           thing
        [1]  entity           physical_entity
        [2]  entity           abstraction
        [3]  physical_entity  entity
        [4]  physical_entity  matter
        [5]  physical_entity  process
        [6]  physical_entity  thing
        [7]  physical_entity  substance
        [8]  physical_entity  object
        [9]  physical_entity  causal_agent

        >>> cm = my_frame.categorical_summary('source', top_k=2)
        [===Job Progress===]

        >>> cm
        column_name = "source"
        [#]  level        frequency  percentage
        ===========================================
        [0]  thing                9  0.321428571429
        [1]  abstraction          9  0.321428571429
        [2]  <Missing>            0             0.0
        [3]  <Other>             10  0.357142857143

        >>> cm = my_frame.categorical_summary('source', threshold = 0.5)
        [===Job Progress===]

        >>> cm
        column_name = "source"
        [#]  level      frequency  percentage
        =====================================
        [0]  <Missing>          0         0.0
        [1]  <Other>           28         1.0

        >>> cm = my_frame.categorical_summary(['source', 'target'], top_k=[2, None], threshold=[None, 0.5])
        [===Job Progress===]

        >>> cm
        column_name = "source"
        [#]  level        frequency  percentage
        ===========================================
        [0]  thing                9  0.321428571429
        [1]  abstraction          9  0.321428571429
        [2]  <Missing>            0             0.0
        [3]  <Other>             10  0.357142857143
        <BLANKLINE>
        column_name = "target"
        [#]  level      frequency  percentage
        =====================================
        [0]  <Missing>          0         0.0
        [1]  <Other>           28         1.0

    """
    convert = self._tc.jutils.convert

    def to_scala_option_list(values):
        # Shared conversion for top_k and threshold.  None (argument not given) is
        # handed back untouched; anything else becomes a scala list with one scala
        # option per entry, a bare value being treated as a one-item list.
        if values is None:
            return None
        if not isinstance(values, list):
            values = [values]
        return convert.to_scala_list([convert.to_scala_option(value) for value in values])

    column_names = columns if isinstance(columns, list) else [columns]
    scala_columns = convert.to_scala_list_string(column_names)
    scala_top_k = to_scala_option_list(top_k)
    scala_threshold = to_scala_option_list(threshold)

    # The whole top_k / threshold argument is itself passed to scala wrapped in an option
    scala_summaries = list(self._scala.categoricalSummary(scala_columns,
                                                          convert.to_scala_option(scala_top_k),
                                                          convert.to_scala_option(scala_threshold)))
    return CategoricalSummaryOutputList([CategoricalSummaryOutput(summary) for summary in scala_summaries])

Functions

def categorical_summary(

self, columns, top_k=None, threshold=None)

Build summary of the data.

Parameters:

columns

(str or List[str]):

Name of the column, or list of names of the columns, to summarize.

top_k

(Optional[int]):

Displays levels which are in the top k most frequently occurring values for that column. Default is 10.

threshold

(Optional[float]):

Displays levels which are above the threshold percentage with respect to the total row count. Default is 0.0.

Returns

(List[CategoricalSummaryOutput]):

List of CategoricalSummaryOutput objects for specified column(s) consisting of levels with their frequency and percentage.

Compute a summary of the data in one or more columns of categorical or numerical data type. The returned value is a list containing a categorical summary for each specified column.

For each column, levels which satisfy the top k and/or threshold cutoffs are displayed along with their frequency and percentage occurrence with respect to the total rows in the dataset.

Performs level pruning first based on top k and then keeps only the levels which satisfy the threshold criterion.

Missing data is reported when a column value is empty ("") or null.

All remaining data is grouped together in the Other category and its frequency and percentage are reported as well.

User must specify the column name and can optionally specify top_k and/or threshold.

Examples:

Consider Frame my_frame, which contains the data

>>> my_frame.inspect()
[#]  source           target
=====================================
[0]  entity           thing
[1]  entity           physical_entity
[2]  entity           abstraction
[3]  physical_entity  entity
[4]  physical_entity  matter
[5]  physical_entity  process
[6]  physical_entity  thing
[7]  physical_entity  substance
[8]  physical_entity  object
[9]  physical_entity  causal_agent

>>> cm = my_frame.categorical_summary('source', top_k=2)
[===Job Progress===]

>>> cm
column_name = "source"
[#]  level        frequency  percentage
===========================================
[0]  thing                9  0.321428571429
[1]  abstraction          9  0.321428571429
[2]  <Missing>            0             0.0
[3]  <Other>             10  0.357142857143

>>> cm = my_frame.categorical_summary('source', threshold = 0.5)
[===Job Progress===]

>>> cm
column_name = "source"
[#]  level      frequency  percentage
=====================================
[0]  <Missing>          0         0.0
[1]  <Other>           28         1.0

>>> cm = my_frame.categorical_summary(['source', 'target'], top_k=[2, None], threshold=[None, 0.5])
[===Job Progress===]

>>> cm
column_name = "source"
[#]  level        frequency  percentage
===========================================
[0]  thing                9  0.321428571429
[1]  abstraction          9  0.321428571429
[2]  <Missing>            0             0.0
[3]  <Other>             10  0.357142857143
<BLANKLINE>
column_name = "target"
[#]  level      frequency  percentage
=====================================
[0]  <Missing>          0         0.0
[1]  <Other>           28         1.0

Show source ≡

def categorical_summary(self, columns, top_k=None, threshold=None):
    """
    Build a categorical summary of one or more columns of the frame.

    Parameters
    ----------

    :param columns: (str or List[str]) Name of the column, or list of names of the columns, to summarize.
    :param top_k: (Optional[int or List[Optional[int]]]) Displays levels which are in the top k most frequently
            occurring values for that column.  Either a single value or a list of values may be given;
            a list may contain None entries (see the last example below).
            Default is 10.
    :param threshold: (Optional[float or List[Optional[float]]]) Displays levels which are above the threshold
            percentage with respect to the total row count.  Either a single value or a list of values
            may be given; a list may contain None entries (see the last example below).
            Default is 0.0.
    :return: (CategoricalSummaryOutputList) List of CategoricalSummaryOutput objects for the specified column(s),
             consisting of levels with their frequency and percentage.

    Compute a summary of the data in one or more columns of categorical or numerical data type.
    The returned value is a list containing a categorical summary for each specified column.

    For each column, levels which satisfy the top k and/or threshold cutoffs are
    displayed along with their frequency and percentage occurrence with respect to
    the total rows in the dataset.

    Performs level pruning first based on top k and then keeps only
    the levels which satisfy the threshold criterion.

    Missing data is reported when a column value is empty ("") or null.

    All remaining data is grouped together in the Other category and its frequency
    and percentage are reported as well.

    User must specify the column name and can optionally specify top_k and/or threshold.

    Examples
    --------

    Consider Frame *my_frame*, which contains the data


        >>> my_frame.inspect()
        [#]  source           target
        =====================================
        [0]  entity           thing
        [1]  entity           physical_entity
        [2]  entity           abstraction
        [3]  physical_entity  entity
        [4]  physical_entity  matter
        [5]  physical_entity  process
        [6]  physical_entity  thing
        [7]  physical_entity  substance
        [8]  physical_entity  object
        [9]  physical_entity  causal_agent

        >>> cm = my_frame.categorical_summary('source', top_k=2)
        [===Job Progress===]

        >>> cm
        column_name = "source"
        [#]  level        frequency  percentage
        ===========================================
        [0]  thing                9  0.321428571429
        [1]  abstraction          9  0.321428571429
        [2]  <Missing>            0             0.0
        [3]  <Other>             10  0.357142857143

        >>> cm = my_frame.categorical_summary('source', threshold = 0.5)
        [===Job Progress===]

        >>> cm
        column_name = "source"
        [#]  level      frequency  percentage
        =====================================
        [0]  <Missing>          0         0.0
        [1]  <Other>           28         1.0

        >>> cm = my_frame.categorical_summary(['source', 'target'], top_k=[2, None], threshold=[None, 0.5])
        [===Job Progress===]

        >>> cm
        column_name = "source"
        [#]  level        frequency  percentage
        ===========================================
        [0]  thing                9  0.321428571429
        [1]  abstraction          9  0.321428571429
        [2]  <Missing>            0             0.0
        [3]  <Other>             10  0.357142857143
        <BLANKLINE>
        column_name = "target"
        [#]  level      frequency  percentage
        =====================================
        [0]  <Missing>          0         0.0
        [1]  <Other>           28         1.0

    """
    convert = self._tc.jutils.convert

    def to_scala_option_list(values):
        # Shared conversion for top_k and threshold.  None (argument not given) is
        # handed back untouched; anything else becomes a scala list with one scala
        # option per entry, a bare value being treated as a one-item list.
        if values is None:
            return None
        if not isinstance(values, list):
            values = [values]
        return convert.to_scala_list([convert.to_scala_option(value) for value in values])

    column_names = columns if isinstance(columns, list) else [columns]
    scala_columns = convert.to_scala_list_string(column_names)
    scala_top_k = to_scala_option_list(top_k)
    scala_threshold = to_scala_option_list(threshold)

    # The whole top_k / threshold argument is itself passed to scala wrapped in an option
    scala_summaries = list(self._scala.categoricalSummary(scala_columns,
                                                          convert.to_scala_option(scala_top_k),
                                                          convert.to_scala_option(scala_threshold)))
    return CategoricalSummaryOutputList([CategoricalSummaryOutput(summary) for summary in scala_summaries])

Classes

class CategoricalSummaryOutput

CategoricalSummaryOutput class containing the levels with their frequency and percentage for the specified column.

Show source ≡

class CategoricalSummaryOutput(PropertiesObject):
    """
    Categorical summary of one column: the column name plus its levels, each with a frequency and a percentage.
    """
    def __init__(self, scala_result):
        """
        :param scala_result: object exposing column() and levels(); every item of levels() is wrapped in a LevelData
        """
        self._column_name = scala_result.column()
        scala_levels = list(scala_result.levels())
        self._levels = [LevelData(scala_level) for scala_level in scala_levels]

    @property
    def column_name(self):
        """Value returned by the scala result's column()."""
        return self._column_name

    @property
    def levels(self):
        """List of LevelData objects, one per level of the scala result."""
        return self._levels

    def __str__(self):
        # One table row per level: [level, frequency, percentage]
        table_rows = [[entry.level, entry.frequency, entry.percentage] for entry in self.levels]
        table_schema = [("level", str), ("frequency", int), ("percentage", float)]
        table = RowsInspection(table_rows, table_schema, 0)

        return "column_name = \"{0}\"\n{1}".format(self.column_name, table)

    def __repr__(self):
        # repr matches str, so the table is shown when the object is echoed at a prompt
        return str(self)

Ancestors (in MRO)

CategoricalSummaryOutput
sparktk.propobj.PropertiesObject
__builtin__.object

Instance variables

var column_name

var levels

Methods

def __init__(

self, scala_result)

Show source ≡

def __init__(self, scala_result):
    """
    Store the column name and wrap every level of the scala result in a LevelData.

    :param scala_result: object exposing column() and levels()
    """
    self._column_name = scala_result.column()
    scala_levels = list(scala_result.levels())
    self._levels = [LevelData(scala_level) for scala_level in scala_levels]

def to_dict(

self)

Show source ≡

def to_dict(self):
    """
    Return the result of self._properties() after updating it in place with self._attributes().

    Assuming both helpers return dicts, an attribute entry replaces a property entry with the same key.
    """
    merged = self._properties()
    attributes = self._attributes()
    merged.update(attributes)
    return merged

def to_json(

self)

Show source ≡

def to_json(self):
    """Return the dictionary produced by to_dict() serialized with json.dumps."""
    as_dict = self.to_dict()
    return json.dumps(as_dict)

class CategoricalSummaryOutputList

Show source ≡

class CategoricalSummaryOutputList(list):
    """
    List whose text form is the string form of each of its items, separated by a blank line.
    """
    def __str__(self):
        rendered_items = (str(entry) for entry in self)
        return "\n\n".join(rendered_items)

    def __repr__(self):
        # repr matches str, so echoing the list at an interactive prompt shows the same text as print
        return str(self)

Ancestors (in MRO)

CategoricalSummaryOutputList
__builtin__.list
__builtin__.object

class LevelData

Show source ≡

class LevelData(PropertiesObject):
    """
    Read-only holder for one level entry of a categorical summary: the level value together with the
    frequency and percentage reported for it.
    """
    def __init__(self, scala_result):
        """
        :param scala_result: object exposing level(), frequency() and percentage() methods
        """
        # Read the three values once, up front; the properties below return the stored copies
        self._level = scala_result.level()
        self._frequency = scala_result.frequency()
        self._percentage = scala_result.percentage()

    @property
    def level(self):
        """Value returned by the scala result's level()."""
        return self._level

    @property
    def frequency(self):
        """Value returned by the scala result's frequency()."""
        return self._frequency

    @property
    def percentage(self):
        """Value returned by the scala result's percentage()."""
        return self._percentage

Ancestors (in MRO)

LevelData
sparktk.propobj.PropertiesObject
__builtin__.object

Instance variables

var frequency

var level

var percentage

Methods

def __init__(

self, scala_result)

Show source ≡

def __init__(self, scala_result):
    """
    Store the level, frequency and percentage read from the given scala result.

    :param scala_result: object exposing level(), frequency() and percentage() methods
    """
    # Each accessor is called exactly once; the values are kept on private attributes
    self._level = scala_result.level()
    self._frequency = scala_result.frequency()
    self._percentage = scala_result.percentage()

def to_dict(

self)

Show source ≡

def to_dict(self):
    """
    Return the result of self._properties() after updating it in place with self._attributes().

    Assuming both helpers return dicts, an attribute entry replaces a property entry with the same key.
    """
    merged = self._properties()
    attributes = self._attributes()
    merged.update(attributes)
    return merged

def to_json(

self)

Show source ≡

def to_json(self):
    """Return the dictionary produced by to_dict() serialized with json.dumps."""
    as_dict = self.to_dict()
    return json.dumps(as_dict)

Index

Functions

Classes

Functions

Classes

Ancestors (in MRO)

Instance variables

Methods

Ancestors (in MRO)

Ancestors (in MRO)

Instance variables

Methods