зеркало из https://github.com/mozilla/bugbug.git
50 строки
1.7 KiB
Python
50 строки
1.7 KiB
Python
# -*- coding: utf-8 -*-
|
|
# This Source Code Form is subject to the terms of the Mozilla Public
|
|
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
|
# You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
|
|
from typing import Dict
|
|
|
|
from sklearn.base import BaseEstimator
|
|
from sklearn.base import TransformerMixin
|
|
|
|
|
|
# From http://scikit-learn.org/stable/auto_examples/hetero_feature_union.html.
|
|
class ItemSelector(BaseEstimator, TransformerMixin):
    """Pick out one feature's values from data organised per feature.

    The input is indexed first by feature and then by sample, so that::

        len(data[key]) == n_samples

    This is the reverse of the usual scikit-learn feature-matrix layout, in
    which the first index runs over samples.

    The only operation performed on the input is subscripting
    (``data[key]``), so any container supporting it can be used: a dict of
    lists, a 2D numpy array, a pandas DataFrame, a numpy record array, and
    so on.

    >>> data = {'a': [1, 5, 2, 5, 2, 8],
    ...         'b': [9, 4, 1, 4, 1, 3]}
    >>> selector = ItemSelector(key='a')
    >>> selector.transform(data) == data['a']
    True

    Data organised per sample (for example a list of dicts) is not
    supported; for that layout look at something along the lines of
    `sklearn.feature_extraction.DictVectorizer`.

    Parameters
    ----------
    key : hashable, required
        Identifies which entry of the container ``transform`` returns.
    """

    def __init__(self, key):
        # Kept verbatim under the same name as the constructor argument.
        self.key = key

    def fit(self, x, y=None):
        """Return this selector unchanged; ``x`` and ``y`` are not used."""
        return self

    def transform(self, data_dict: Dict):
        """Return the entry of ``data_dict`` stored under ``self.key``.

        Despite the ``Dict`` annotation, the argument is only ever
        subscripted, so any object supporting ``data_dict[key]`` works.
        """
        selected = data_dict[self.key]
        return selected
|