Up

sparktk.frame.ops.drop_columns module

# vim: set encoding=utf-8

#  Copyright (c) 2016 Intel Corporation 
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

from sparktk.frame.pyframe import PythonFrame
from sparktk.frame.schema import get_indices_for_selected_columns


def drop_columns(self, columns):
    """
    Drops columns from the frame

    Parameters
    ----------

    :param columns: (str or List[str]) names of the columns to drop

    Examples
    --------

    For this example, the Frame object *my_frame* accesses a frame with 4 columns
    columns *column_a*, *column_b*, *column_c* and *column_d* and drops 2 columns *column_b* and *column_d* using drop columns.



        >>> print frame.schema
        [('column_a', ), ('column_b', ), ('column_c', ), ('column_d', )]


    Eliminate columns *column_b* and *column_d*:

        >>> frame.drop_columns(["column_b", "column_d"])
        >>> print frame.schema
        [('column_a', ), ('column_c', )]

    Now the frame only has the columns *column_a* and *column_c*.
    For further examples, see: ref:`example_frame.drop_columns`.

    """
    if isinstance(columns, basestring):
        columns = [columns]
    if self._is_scala:
        self._scala.dropColumns(self._tc.jutils.convert.to_scala_vector_string(columns))
    else:
        victim_indices = get_indices_for_selected_columns(self.schema, columns)
        survivor_indices = [i for i in xrange(len(self.schema)) if i not in victim_indices]
        filtered_schema = [self.schema[i] for i in survivor_indices]

        def filter_fields(row):
            return [row[i] for i in survivor_indices]
        filtered_rdd = self.rdd.map(filter_fields)
        self._frame = PythonFrame(filtered_rdd, filtered_schema)

Functions

def drop_columns(

self, columns)

Drops columns from the frame

Parameters:
columns(str or List[str]):names of the columns to drop
Examples:

For this example, the Frame object my_frame accesses a frame with 4 columns columns column_a, column_b, column_c and column_d and drops 2 columns column_b and column_d using drop columns.

>>> print frame.schema
[('column_a', <type 'str'>), ('column_b', <type 'int'>), ('column_c', <type 'str'>), ('column_d', <type 'int'>)]

Eliminate columns column_b and column_d:

>>> frame.drop_columns(["column_b", "column_d"])
>>> print frame.schema
[('column_a', <type 'str'>), ('column_c', <type 'str'>)]

Now the frame only has the columns column_a and column_c. For further examples, see: ref:example_frame.drop_columns.

def drop_columns(self, columns):
    """
    Drops columns from the frame

    Parameters
    ----------

    :param columns: (str or List[str]) names of the columns to drop

    Examples
    --------

    For this example, the Frame object *my_frame* accesses a frame with 4 columns
    columns *column_a*, *column_b*, *column_c* and *column_d* and drops 2 columns *column_b* and *column_d* using drop columns.



        >>> print frame.schema
        [('column_a', ), ('column_b', ), ('column_c', ), ('column_d', )]


    Eliminate columns *column_b* and *column_d*:

        >>> frame.drop_columns(["column_b", "column_d"])
        >>> print frame.schema
        [('column_a', ), ('column_c', )]

    Now the frame only has the columns *column_a* and *column_c*.
    For further examples, see: ref:`example_frame.drop_columns`.

    """
    if isinstance(columns, basestring):
        columns = [columns]
    if self._is_scala:
        self._scala.dropColumns(self._tc.jutils.convert.to_scala_vector_string(columns))
    else:
        victim_indices = get_indices_for_selected_columns(self.schema, columns)
        survivor_indices = [i for i in xrange(len(self.schema)) if i not in victim_indices]
        filtered_schema = [self.schema[i] for i in survivor_indices]

        def filter_fields(row):
            return [row[i] for i in survivor_indices]
        filtered_rdd = self.rdd.map(filter_fields)
        self._frame = PythonFrame(filtered_rdd, filtered_schema)