sparktk.frame.ops.topk module
# vim: set encoding=utf-8
# Copyright (c) 2016 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
def top_k(self, column_name, k, weight_column=None):
    """
    Most or least frequent column values.

    Parameters
    ----------

    :param column_name: (str) Name of the column whose top (or bottom) K distinct values are computed.
    :param k: (int) How many entries to return; a negative k requests the bottom k instead.
    :param weight_column: (Optional[str]) Name of a numerical column supplying the weights (frequencies)
                          for the topK calculation.  When omitted, every item has a weight of 1.
    :return: (Frame) A new frame wrapping the result of the topK calculation.

    Finds the top (or bottom) K distinct values of a column, ranked by count.  The counts may be
    weighted by another column.  Data elements whose weight is <= 0, NaN or infinite are left out
    of the calculation.  If no data element has a finite weight > 0, then topK is empty.

    Examples
    --------

    For this example, we calculate the top 2 counties in a data frame:

    Consider the following frame:

        >>> frame.inspect(frame.count())
        [##]  rank  city         population_2013  population_2010  change  county
        =============================================================================
        [0]      1  Portland              609456           583776  4.40%   Multnomah
        [1]      2  Salem                 160614           154637  3.87%   Marion
        [2]      3  Eugene                159190           156185  1.92%   Lane
        [3]      4  Gresham               109397           105594  3.60%   Multnomah
        [4]      5  Hillsboro              97368            91611  6.28%   Washington
        [5]      6  Beaverton              93542            89803  4.16%   Washington
        [6]     15  Grants Pass            35076            34533  1.57%   Josephine
        [7]     16  Oregon City            34622            31859  8.67%   Clackamas
        [8]     17  McMinnville            33131            32187  2.93%   Yamhill
        [9]     18  Redmond                27427            26215  4.62%   Deschutes
        [10]    19  Tualatin               26879            26054  4.17%   Washington
        [11]    20  West Linn              25992            25109  3.52%   Clackamas
        [12]     7  Bend                   81236            76639  6.00%   Deschutes
        [13]     8  Medford                77677            74907  3.70%   Jackson
        [14]     9  Springfield            60177            59403  1.30%   Lane
        [15]    10  Corvallis              55298            54462  1.54%   Benton
        [16]    11  Albany                 51583            50158  2.84%   Linn
        [17]    12  Tigard                 50444            48035  5.02%   Washington
        [18]    13  Lake Oswego            37610            36619  2.71%   Clackamas
        [19]    14  Keizer                 37064            36478  1.61%   Marion

        >>> top_frame = frame.top_k("county", 2)
        [===Job Progress===]

        >>> top_frame.inspect()
        [#]  county      count
        ======================
        [0]  Washington    4.0
        [1]  Clackamas     3.0

    """
    # Function-scope import (presumably to avoid a circular import with the
    # frame module -- TODO confirm).
    from sparktk.frame.frame import Frame

    context = self._tc
    # The optional weight column is passed through the jutils
    # to_scala_option converter before being handed to the Scala call.
    weight_option = context.jutils.convert.to_scala_option(weight_column)
    scala_result = self._scala.topK(column_name, k, weight_option)
    return Frame(context, scala_result)
Functions
def top_k(self, column_name, k, weight_column=None)
Most or least frequent column values.
Parameters:
column_name | (str): | The column whose top (or bottom) K distinct values are to be calculated. |
k | (int): | Number of entries to return (If k is negative, return bottom k). |
weight_column | (Optional[str]): | The column that provides weights (frequencies) for the topK calculation. Must contain numerical data. Default is 1 for all items. |
Calculates the top (or bottom) K distinct values of a column, ranked by count. The counts can be weighted by another column. Data elements whose weight is <= 0, NaN, or infinite are excluded from the calculation. If no data element has a finite weight > 0, then topK is empty.
Examples:
This example calculates the top 2 counties (the two most frequent county values) in a data frame.
Consider the following frame:
>>> frame.inspect(frame.count())
[##] rank city population_2013 population_2010 change county
=============================================================================
[0] 1 Portland 609456 583776 4.40% Multnomah
[1] 2 Salem 160614 154637 3.87% Marion
[2] 3 Eugene 159190 156185 1.92% Lane
[3] 4 Gresham 109397 105594 3.60% Multnomah
[4] 5 Hillsboro 97368 91611 6.28% Washington
[5] 6 Beaverton 93542 89803 4.16% Washington
[6] 15 Grants Pass 35076 34533 1.57% Josephine
[7] 16 Oregon City 34622 31859 8.67% Clackamas
[8] 17 McMinnville 33131 32187 2.93% Yamhill
[9] 18 Redmond 27427 26215 4.62% Deschutes
[10] 19 Tualatin 26879 26054 4.17% Washington
[11] 20 West Linn 25992 25109 3.52% Clackamas
[12] 7 Bend 81236 76639 6.00% Deschutes
[13] 8 Medford 77677 74907 3.70% Jackson
[14] 9 Springfield 60177 59403 1.30% Lane
[15] 10 Corvallis 55298 54462 1.54% Benton
[16] 11 Albany 51583 50158 2.84% Linn
[17] 12 Tigard 50444 48035 5.02% Washington
[18] 13 Lake Oswego 37610 36619 2.71% Clackamas
[19] 14 Keizer 37064 36478 1.61% Marion
>>> top_frame = frame.top_k("county", 2)
[===Job Progress===]
>>> top_frame.inspect()
[#] county count
======================
[0] Washington 4.0
[1] Clackamas 3.0
def top_k(self, column_name, k, weight_column=None):
    """
    Most or least frequent column values.

    Parameters
    ----------

    :param column_name: (str) Name of the column whose top (or bottom) K distinct values are computed.
    :param k: (int) How many entries to return; a negative k requests the bottom k instead.
    :param weight_column: (Optional[str]) Name of a numerical column supplying the weights (frequencies)
                          for the topK calculation.  When omitted, every item has a weight of 1.
    :return: (Frame) A new frame wrapping the result of the topK calculation.

    Finds the top (or bottom) K distinct values of a column, ranked by count.  The counts may be
    weighted by another column.  Data elements whose weight is <= 0, NaN or infinite are left out
    of the calculation.  If no data element has a finite weight > 0, then topK is empty.

    Examples
    --------

    For this example, we calculate the top 2 counties in a data frame:

    Consider the following frame:

        >>> frame.inspect(frame.count())
        [##]  rank  city         population_2013  population_2010  change  county
        =============================================================================
        [0]      1  Portland              609456           583776  4.40%   Multnomah
        [1]      2  Salem                 160614           154637  3.87%   Marion
        [2]      3  Eugene                159190           156185  1.92%   Lane
        [3]      4  Gresham               109397           105594  3.60%   Multnomah
        [4]      5  Hillsboro              97368            91611  6.28%   Washington
        [5]      6  Beaverton              93542            89803  4.16%   Washington
        [6]     15  Grants Pass            35076            34533  1.57%   Josephine
        [7]     16  Oregon City            34622            31859  8.67%   Clackamas
        [8]     17  McMinnville            33131            32187  2.93%   Yamhill
        [9]     18  Redmond                27427            26215  4.62%   Deschutes
        [10]    19  Tualatin               26879            26054  4.17%   Washington
        [11]    20  West Linn              25992            25109  3.52%   Clackamas
        [12]     7  Bend                   81236            76639  6.00%   Deschutes
        [13]     8  Medford                77677            74907  3.70%   Jackson
        [14]     9  Springfield            60177            59403  1.30%   Lane
        [15]    10  Corvallis              55298            54462  1.54%   Benton
        [16]    11  Albany                 51583            50158  2.84%   Linn
        [17]    12  Tigard                 50444            48035  5.02%   Washington
        [18]    13  Lake Oswego            37610            36619  2.71%   Clackamas
        [19]    14  Keizer                 37064            36478  1.61%   Marion

        >>> top_frame = frame.top_k("county", 2)
        [===Job Progress===]

        >>> top_frame.inspect()
        [#]  county      count
        ======================
        [0]  Washington    4.0
        [1]  Clackamas     3.0

    """
    # Function-scope import (presumably to avoid a circular import with the
    # frame module -- TODO confirm).
    from sparktk.frame.frame import Frame

    context = self._tc
    # The optional weight column is passed through the jutils
    # to_scala_option converter before being handed to the Scala call.
    weight_option = context.jutils.convert.to_scala_option(weight_column)
    scala_result = self._scala.topK(column_name, k, weight_option)
    return Frame(context, scala_result)