Alternative solutions to sample-aligned meta-data

This page contains alternative solutions that were discussed but ultimately not adopted in the SLEP.

Solution sketches require these definitions:

import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score
# get_scorer is re-exported here because the solution sketches import it
# from this module (``from defs import get_scorer``).
from sklearn.metrics import get_scorer
from sklearn.metrics import make_scorer
from sklearn.model_selection import GroupKFold, cross_validate
from sklearn.pipeline import make_pipeline

# Toy data shared by all sketches: N samples with M features, binary labels,
# ten possible group ids and two independent sets of per-sample weights.
N, M = 100, 4
X = np.random.rand(N, M)
y = np.random.randint(0, 2, size=N)
my_groups = np.random.randint(0, 10, size=N)
my_weights = np.random.rand(N)
my_other_weights = np.random.rand(N)

Status quo solution 0a: additional feature

Without changing scikit-learn, the following hack can be used:

Additional numeric features representing sample props can be appended to the data and passed around, being handled specially in each consumer of features or sample props.

import numpy as np

from defs import (GroupKFold, get_scorer, SelectKBest,
                  LogisticRegressionCV, cross_validate,
                  make_pipeline, X, y, my_groups, my_weights)

# %%
# Case A: weighted scoring and fitting


# Positions, counted from the end of the augmented X, of the sample-prop
# columns that are appended after the real features.
GROUPS_IDX = -1
WEIGHT_IDX = -2


def unwrap_X(X, n_props=2):
    """Return the feature columns of an augmented ``X``.

    The trailing ``n_props`` columns hold sample props (weights, groups)
    rather than features, so they are dropped.

    Parameters
    ----------
    X : 2-D array
        Feature matrix with the prop columns appended on the right.
    n_props : int, default=2
        Number of trailing prop columns to strip.

    Returns
    -------
    2-D array
        ``X`` without its last ``n_props`` columns.

    Raises
    ------
    ValueError
        If ``n_props`` is negative or exceeds the number of columns.
    """
    n_features = X.shape[1] - n_props
    if n_props < 0 or n_features < 0:
        raise ValueError(
            "n_props must be between 0 and the number of columns of X; "
            "got n_props=%r for %d columns" % (n_props, X.shape[1]))
    # Slice by an explicit end position: ``X[:, :-n_props]`` would return
    # no columns at all when n_props == 0.
    return X[:, :n_features]


class WrappedGroupCV:
    """Splitter adapter that reads the groups from a column of ``X``.

    Any ``groups`` argument passed to ``split`` or ``get_n_splits`` is
    ignored: the groups always come from column ``groups_idx`` of ``X``, and
    the wrapped splitter receives ``unwrap_X(X)`` instead of ``X``.
    """

    def __init__(self, base_cv, groups_idx=GROUPS_IDX):
        self.base_cv = base_cv
        self.groups_idx = groups_idx

    def _column_groups(self, X):
        """Return the groups column stored inside ``X``."""
        return X[:, self.groups_idx]

    def split(self, X, y, groups=None):
        """Delegate to ``base_cv.split`` with the groups taken from ``X``."""
        embedded_groups = self._column_groups(X)
        return self.base_cv.split(unwrap_X(X), y, groups=embedded_groups)

    def get_n_splits(self, X, y, groups=None):
        """Delegate to ``base_cv.get_n_splits`` with groups from ``X``."""
        embedded_groups = self._column_groups(X)
        return self.base_cv.get_n_splits(
            unwrap_X(X), y, groups=embedded_groups)


wrapped_group_cv = WrappedGroupCV(base_cv=GroupKFold())


class WrappedLogisticRegressionCV(LogisticRegressionCV):
    """LogisticRegressionCV whose fit weights are read from ``X`` itself."""

    def fit(self, X, y):
        """Fit on ``unwrap_X(X)``, weighted by column ``WEIGHT_IDX`` of ``X``."""
        features = unwrap_X(X)
        embedded_weights = X[:, WEIGHT_IDX]
        return super().fit(features, y, sample_weight=embedded_weights)


acc_scorer = get_scorer('accuracy')


def wrapped_weighted_acc(est, X, y, sample_weight=None):
    """Score ``est`` with ``acc_scorer`` using the weights stored in ``X``.

    The ``sample_weight`` argument is ignored; the weights are always read
    from column ``WEIGHT_IDX`` of ``X``, and the scorer is given
    ``unwrap_X(X)`` rather than ``X``.
    """
    features = unwrap_X(X)
    embedded_weights = X[:, WEIGHT_IDX]
    return acc_scorer(est, features, y, sample_weight=embedded_weights)

# NOTE(review): set_props_request is not defined anywhere in this sketch;
# presumably it is the API proposed by the SLEP -- confirm that it belongs in
# a "without changing scikit-learn" example.
lr = WrappedLogisticRegressionCV(
    cv=wrapped_group_cv,
    scoring=wrapped_weighted_acc,
).set_props_request(['sample_weight'])
# my_weights and my_groups are 1-D, so they have to be appended as columns:
# np.hstack cannot concatenate them with the 2-D X.  The column order matches
# WEIGHT_IDX = -2 and GROUPS_IDX = -1.
cross_validate(lr, np.column_stack([X, my_weights, my_groups]), y,
               cv=wrapped_group_cv,
               scoring=wrapped_weighted_acc)

# %%
# Case B: weighted scoring and unweighted fitting

class UnweightedWrappedLogisticRegressionCV(LogisticRegressionCV):
    """LogisticRegressionCV that fits on ``unwrap_X(X)`` without weights."""

    def fit(self, X, y):
        """Fit on ``unwrap_X(X)``; no ``sample_weight`` is passed on."""
        features = unwrap_X(X)
        return super().fit(features, y)


lr = UnweightedWrappedLogisticRegressionCV(
    cv=wrapped_group_cv,
    scoring=wrapped_weighted_acc,
).set_props_request(['sample_weight'])
# my_weights and my_groups are 1-D, so they have to be appended as columns:
# np.hstack cannot concatenate them with the 2-D X.  The weights column is
# still appended because the scorer reads it, even though fit ignores it.
cross_validate(lr, np.column_stack([X, my_weights, my_groups]), y,
               cv=wrapped_group_cv,
               scoring=wrapped_weighted_acc)


# %%
# Case C: unweighted feature selection

class UnweightedWrappedSelectKBest(SelectKBest):
    """SelectKBest that is fitted on ``unwrap_X(X)`` instead of ``X``."""

    def fit(self, X, y):
        """Fit the selector on ``unwrap_X(X)``."""
        features = unwrap_X(X)
        return super().fit(features, y)


lr = WrappedLogisticRegressionCV(
    cv=wrapped_group_cv,
    scoring=wrapped_weighted_acc,
).set_props_request(['sample_weight'])
sel = UnweightedWrappedSelectKBest()
pipe = make_pipeline(sel, lr)
# my_weights and my_groups are 1-D, so they have to be appended as columns:
# np.hstack cannot concatenate them with the 2-D X.
cross_validate(pipe, np.column_stack([X, my_weights, my_groups]), y,
               cv=wrapped_group_cv,
               scoring=wrapped_weighted_acc)

# %%
# Case D: different scoring and fitting weights

# Position, counted from the end of the augmented X, reserved for a separate
# scoring-weight column (this case is left unfinished, see the TODO below).
SCORING_WEIGHT_IDX = -3

# TODO: proceed from here. Note that this change implies the need to add
# a parameter to unwrap_X, since we will now append an additional column to X.

Status quo solution 0b: Pandas Index and global resources

Without changing scikit-learn, the following hack can be used:

If y is represented with a Pandas datatype, then its index can be used to access required elements from props stored in a global namespace (or otherwise made available to the estimator before fitting). This is possible everywhere that a ground-truth y is passed, including fit, split, score, and metrics. A similar solution with X is also possible (except for metrics), if all Pipeline components retain the original Pandas Index.

Issues:

Relies on a global data source.
Requires Pandas data types and indices to be maintained throughout.

import pandas as pd
# GroupKFold is needed further down to build the wrapped splitter.
from defs import (GroupKFold, get_scorer, SelectKBest,
                  LogisticRegressionCV, cross_validate,
                  make_pipeline, X, y, my_groups, my_weights,
                  my_other_weights)

# X becomes a DataFrame so that its index identifies the samples; the props
# live in module-level Series that are looked up by that index.
X = pd.DataFrame(X)
MY_GROUPS = pd.Series(my_groups)
MY_WEIGHTS = pd.Series(my_weights)
MY_OTHER_WEIGHTS = pd.Series(my_other_weights)

# %%
# Case A: weighted scoring and fitting


class WrappedGroupCV:
    """Splitter adapter that looks the groups up in the global ``MY_GROUPS``.

    Any ``groups`` argument passed in is ignored; the groups are always
    selected from ``MY_GROUPS`` using the index of ``X``.
    """

    def __init__(self, base_cv):
        self.base_cv = base_cv

    def split(self, X, y, groups=None):
        """Delegate to ``base_cv.split`` with groups aligned to ``X.index``."""
        aligned_groups = MY_GROUPS.loc[X.index]
        return self.base_cv.split(X, y, groups=aligned_groups)

    def get_n_splits(self, X, y, groups=None):
        """Delegate to ``base_cv.get_n_splits`` with the aligned groups."""
        aligned_groups = MY_GROUPS.loc[X.index]
        return self.base_cv.get_n_splits(X, y, groups=aligned_groups)


wrapped_group_cv = WrappedGroupCV(base_cv=GroupKFold())


class WeightedLogisticRegressionCV(LogisticRegressionCV):
    """LogisticRegressionCV weighted by the global ``MY_WEIGHTS`` series."""

    def fit(self, X, y):
        """Fit with the weights selected from ``MY_WEIGHTS`` by ``X.index``."""
        aligned_weights = MY_WEIGHTS.loc[X.index]
        return super().fit(X, y, sample_weight=aligned_weights)


acc_scorer = get_scorer('accuracy')


def wrapped_weighted_acc(est, X, y, sample_weight=None):
    """Score ``est`` with ``acc_scorer``, weighted by ``MY_WEIGHTS``.

    The ``sample_weight`` argument is ignored; the weights are selected from
    the global ``MY_WEIGHTS`` using the index of ``X``.
    """
    aligned_weights = MY_WEIGHTS.loc[X.index]
    return acc_scorer(est, X, y, sample_weight=aligned_weights)


# The same wrapped splitter and scorer are used for the inner search of the
# estimator and for the outer cross_validate call; no props are passed
# explicitly, since the wrappers look them up by index.
# NOTE(review): set_props_request is not defined anywhere in this sketch;
# presumably it is the API proposed by the SLEP -- confirm that it belongs in
# a "without changing scikit-learn" example.
lr = WeightedLogisticRegressionCV(
    cv=wrapped_group_cv,
    scoring=wrapped_weighted_acc,
).set_props_request(['sample_weight'])
cross_validate(lr, X, y,
               cv=wrapped_group_cv,
               scoring=wrapped_weighted_acc)

# %%
# Case B: weighted scoring and unweighted fitting

# The plain LogisticRegressionCV is used here, so fit receives no weights;
# only the wrapped scorer applies MY_WEIGHTS.
lr = LogisticRegressionCV(
    cv=wrapped_group_cv,
    scoring=wrapped_weighted_acc,
).set_props_request(['sample_weight'])
cross_validate(lr, X, y,
               cv=wrapped_group_cv,
               scoring=wrapped_weighted_acc)


# %%
# Case C: unweighted feature selection

# The stock SelectKBest is used unchanged as the first pipeline step; only
# the final estimator and the scorer look weights up by index.
# NOTE(review): the weighted fit relies on X still carrying its Pandas index
# after the selection step -- confirm that SelectKBest preserves it.
lr = WeightedLogisticRegressionCV(
    cv=wrapped_group_cv,
    scoring=wrapped_weighted_acc,
).set_props_request(['sample_weight'])
sel = SelectKBest()
pipe = make_pipeline(sel, lr)
cross_validate(pipe, X, y,
               cv=wrapped_group_cv,
               scoring=wrapped_weighted_acc)

# %%
# Case D: different scoring and fitting weights


def other_weighted_acc(est, X, y, sample_weight=None):
    """Score ``est`` with ``acc_scorer``, weighted by ``MY_OTHER_WEIGHTS``.

    The ``sample_weight`` argument is ignored; the weights are selected from
    the global ``MY_OTHER_WEIGHTS`` using the index of ``X``.
    """
    aligned_weights = MY_OTHER_WEIGHTS.loc[X.index]
    return acc_scorer(est, X, y, sample_weight=aligned_weights)


# Fitting is weighted by MY_WEIGHTS (through WeightedLogisticRegressionCV),
# while both the inner and the outer scorer use MY_OTHER_WEIGHTS.
lr = WeightedLogisticRegressionCV(
    cv=wrapped_group_cv,
    scoring=other_weighted_acc,
).set_props_request(['sample_weight'])
sel = SelectKBest()
pipe = make_pipeline(sel, lr)
cross_validate(pipe, X, y,
               cv=wrapped_group_cv,
               scoring=other_weighted_acc)

Solution 1: Pass everything

This proposal passes all props to all consumers (estimators, splitters, scorers, etc). The consumer would optionally use props it is familiar with by name and disregard other props.

We may consider providing syntax for the user to control the interpretation of incoming props:

to require that some prop is provided (for an estimator where that prop is otherwise optional)
to disregard some provided prop
to treat a particular prop key as having a certain meaning (e.g. locally interpreting ‘scoring_sample_weight’ as ‘sample_weight’).

These constraints would be checked by calling a helper at the consumer.

Issues:

Error handling: if a key is optional in a consumer, no error will be raised for misspelling. An introspection API might change this, allowing a user or meta-estimator to check if all keys passed are to be used in at least one consumer.
Forwards compatibility: newly supporting a prop key in a consumer will change behaviour. Other than a ChangedBehaviorWarning, I don’t see any way around this.
Introspection: not inherently supported. Would need an API like get_prop_support(names: List[str]) -> Dict[str, Literal["supported", "required", "ignored"]].

In short, this is a simple solution, but prone to risk.

from defs import (accuracy_score, GroupKFold, make_scorer, SelectKBest,
                  LogisticRegressionCV, cross_validate, make_pipeline, X, y,
                  my_groups, my_weights, my_other_weights)

# %%
# Case A: weighted scoring and fitting

# A single props dict is handed to cross_validate; under this proposal it is
# passed on to every consumer, each of which uses the keys it knows by name.
lr = LogisticRegressionCV(
    cv=GroupKFold(),
    scoring='accuracy',
)
cross_validate(lr, X, y, cv=GroupKFold(),
               props={'sample_weight': my_weights, 'groups': my_groups},
               scoring='accuracy')

# Error handling: if props={'sample_eight': my_weights, ...} was passed
# instead, the estimator would fit and score without weights, failing
# silently.

# %%
# Case B: weighted scoring and unweighted fitting


class MyLogisticRegressionCV(LogisticRegressionCV):
    """LogisticRegressionCV that discards ``sample_weight`` before fitting.

    All other props are forwarded unchanged, so the inner splitter and scorer
    still receive them.
    """

    def fit(self, X, y, props=None):
        """Fit without ``sample_weight`` and return the fitted estimator.

        Parameters
        ----------
        X, y
            Training data, forwarded unchanged to the parent ``fit``.
        props : dict or None, default=None
            Sample props.  The caller's dict is never mutated; ``None`` is
            treated as an empty dict.
        """
        # Copy before popping so the caller's dict keeps its weights, and
        # tolerate the default of None.
        props = {} if props is None else dict(props)
        props.pop('sample_weight', None)
        # Return the parent's result so calls can be chained, as with any
        # other fit.
        return super().fit(X, y, props=props)


# %%
# Case C: unweighted feature selection

# Currently feature selection does not handle sample_weight, and as long as
# that remains the case, it will simply ignore the prop passed to it. Hence
# no special handling is needed for the selection step:

lr = LogisticRegressionCV(
    cv=GroupKFold(),
    scoring='accuracy',
)
sel = SelectKBest()
pipe = make_pipeline(sel, lr)
cross_validate(pipe, X, y, cv=GroupKFold(),
               props={'sample_weight': my_weights, 'groups': my_groups},
               scoring='accuracy')

# %%
# Case D: different scoring and fitting weights

weighted_acc = make_scorer(accuracy_score)


def specially_weighted_acc(est, X, y, props):
    """Score with the weights stored under the ``'scoring_weight'`` prop.

    Parameters
    ----------
    est, X, y
        Forwarded unchanged to ``weighted_acc``.
    props : dict
        Sample props; must contain ``'scoring_weight'``.  The caller's dict
        is not modified.

    Raises
    ------
    KeyError
        If ``props`` has no ``'scoring_weight'`` entry.
    """
    props = props.copy()
    # Re-key the scoring weights as 'sample_weight' for the scorer.  The
    # value must be the weight array itself, not the name of its key.
    props['sample_weight'] = props['scoring_weight']
    return weighted_acc(est, X, y, props)


# Two weight props are passed: 'sample_weight' (my_other_weights) and
# 'scoring_weight' (my_weights); the custom scorer is used for both the inner
# and the outer scoring.
lr = LogisticRegressionCV(
    cv=GroupKFold(),
    scoring=specially_weighted_acc,
)
cross_validate(lr, X, y, cv=GroupKFold(),
               props={
                    'scoring_weight': my_weights,
                    'sample_weight': my_other_weights,
                    'groups': my_groups,
               },
               scoring=specially_weighted_acc)

Solution 2: Specify routes at call

Similar to the legacy behavior of fit parameters in sklearn.pipeline.Pipeline, this requires the user to specify the path for each “prop” to follow when calling fit. For example, to pass a prop named ‘weights’ to a step named ‘spam’ in a Pipeline, you might use my_pipe.fit(X, y, props={'spam__weights': my_weights}).

SLEP004’s syntax to override the common routing scheme falls under this solution.

Advantages:

Very explicit and robust to misspellings.

Issues:

The user needs to know the nested internal structure, or it is easy to fail to pass a prop to a specific estimator.
A corollary is that prop keys need changing when the developer modifies their estimator structure (see case C).
This gets especially tricky or impossible where the available routes change mid-fit, such as where a grid search considers estimators with different structures.
We would need to find a different solution for #2630 where a Pipeline could not be the base estimator of AdaBoost because AdaBoost expects the base estimator to accept a fit param keyed ‘sample_weight’.
This may not work if a meta-estimator were to have the role of changing a prop, e.g. a meta-estimator that passes sample_weight corresponding to balanced classes onto its base estimator. The meta-estimator would need a list of destinations to pass modified props to, or a list of keys to modify.
We would need to develop naming conventions for different routes, which may be more complicated than the current conventions; while a GridSearchCV wrapping a Pipeline currently takes parameters with keys like {step_name}__{prop_name}, this explicit routing, and conflict with GridSearchCV routing destinations, implies keys like estimator__{step_name}__{prop_name}.

from defs import (GroupKFold, SelectKBest, LogisticRegressionCV,
                  cross_validate, make_pipeline, X, y, my_groups,
                  my_weights, my_other_weights)

# %%
# Case A: weighted scoring and fitting

# Every consumer is addressed by its full path from cross_validate: the outer
# splitter and scorer ('cv__...', 'scoring__...') and, under 'estimator__...',
# the fit, splitter and scorer of the LogisticRegressionCV.
lr = LogisticRegressionCV(
    cv=GroupKFold(),
    scoring='accuracy',
)
props = {'cv__groups': my_groups,
         'estimator__cv__groups': my_groups,
         'estimator__sample_weight': my_weights,
         'scoring__sample_weight': my_weights,
         'estimator__scoring__sample_weight': my_weights}
cross_validate(lr, X, y, cv=GroupKFold(),
               props=props,
               scoring='accuracy')

# Error handling: if props={'estimator__sample_eight': my_weights, ...} was
# passed instead, the estimator would raise an error.

# %%
# Case B: weighted scoring and unweighted fitting

# No 'estimator__sample_weight' key is given, so fitting is unweighted, while
# both the outer and the inner scorer still receive my_weights.
lr = LogisticRegressionCV(
    cv=GroupKFold(),
    scoring='accuracy',
)
props = {'cv__groups': my_groups,
         'estimator__cv__groups': my_groups,
         'scoring__sample_weight': my_weights,
         'estimator__scoring__sample_weight': my_weights}
cross_validate(lr, X, y, cv=GroupKFold(),
               props=props,
               scoring='accuracy')

# %%
# Case C: unweighted feature selection

lr = LogisticRegressionCV(
    cv=GroupKFold(),
    scoring='accuracy',
)
pipe = make_pipeline(SelectKBest(), lr)
# Wrapping lr in a Pipeline inserts the step name 'logisticregressioncv' into
# every route that ends inside lr -- including the route to its scorer, since
# the Pipeline itself has no 'scoring' child.  No route targets the
# SelectKBest step, so feature selection stays unweighted.
props = {
    'cv__groups': my_groups,
    'estimator__logisticregressioncv__cv__groups': my_groups,
    'estimator__logisticregressioncv__sample_weight': my_weights,
    'scoring__sample_weight': my_weights,
    'estimator__logisticregressioncv__scoring__sample_weight': my_weights,
}
cross_validate(pipe, X, y, cv=GroupKFold(),
               props=props,
               scoring='accuracy')

# %%
# Case D: different scoring and fitting weights

# Fitting is routed my_other_weights, whereas the outer and the inner scorer
# are both routed my_weights.
lr = LogisticRegressionCV(
    cv=GroupKFold(),
    scoring='accuracy',
)
props = {'cv__groups': my_groups,
         'estimator__cv__groups': my_groups,
         'estimator__sample_weight': my_other_weights,
         'scoring__sample_weight': my_weights,
         'estimator__scoring__sample_weight': my_weights}
cross_validate(lr, X, y, cv=GroupKFold(),
               props=props,
               scoring='accuracy')

Solution 3: Specify routes on metaestimators

Each meta-estimator is given a routing specification which it must follow in passing only the required parameters to each of its children. In this context, a GridSearchCV has children including estimator, cv and (each element of) scoring.

Pull request #9566 and its extension in #15425 are partial implementations of this approach.

A major benefit of this approach is that it may require modifying only the prop-routing meta-estimators, leaving the prop consumers unchanged.

All consumers would be required to check that

Issues:

Routing may be hard to get one’s head around, especially since the prop support belongs to the child estimator but the parent is responsible for the routing.
Need to design an API for specifying routings.
As in Solution 2, each local destination for routing props needs to be given a name.
Every router along the route will need consistent instructions to pass a specific prop to a consumer. If the prop is optional in the consumer, routing failures may be hard to identify and debug.
For estimators to be cloned, this routing information needs to be cloned with it. This implies one of: the routing information be stored as a constructor parameter; or clone is extended to explicitly copy routing information.

Possible public syntax:

Each meta-estimator has a prop_routing parameter to encode local routing rules, and a set of named children which it routes to. In #9566, the prop_routing entry for each child may be a white list or black list of named keys passed to the meta-estimator.

from defs import (SelectKBest, LogisticRegressionCV,
                  GroupKFold, cross_validate, make_pipeline, X, y, my_groups,
                  my_weights, my_other_weights)

# %%
# Case A: weighted scoring and fitting

# Each router declares, per named child, which of the props it receives are
# passed on: a list of prop names, or '*' for all of them.
lr = LogisticRegressionCV(
    cv=GroupKFold(),
    scoring='accuracy',
    prop_routing={'cv': ['groups'],
                  'scoring': ['sample_weight'],
                  }
    # One open question here is whether we need to explicitly route
    # sample_weight to LogisticRegressionCV's fitting...
)

# Alternative syntax, which assumes cv receives 'groups' by default, and that a
# method-based API is provided on meta-estimators:
#   lr = LogisticRegressionCV(
#       cv=GroupKFold(),
#       scoring='accuracy',
#   ).add_prop_route(scoring='sample_weight')

cross_validate(lr, X, y, cv=GroupKFold(),
               props={'sample_weight': my_weights, 'groups': my_groups},
               scoring='accuracy',
               prop_routing={'estimator': '*',  # pass all props
                             'cv': ['groups'],
                             'scoring': ['sample_weight'],
                             })

# Error handling: if props={'sample_eight': my_weights, ...} was passed
# instead, LogisticRegressionCV would have to identify that a key was passed
# that could not be routed nor used, in order to raise an error.

# %%
# Case B: weighted scoring and unweighted fitting

# Here we rename the sample_weight prop so that we can specify that it only
# applies to scoring.  No prop named 'sample_weight' is passed at all, so
# nothing is routed to fitting as a weight.
lr = LogisticRegressionCV(
    cv=GroupKFold(),
    scoring='accuracy',
    prop_routing={'cv': ['groups'],
                  # read the following as "scoring should consume
                  # 'scoring_weight' as if it were 'sample_weight'."
                  'scoring': {'sample_weight': 'scoring_weight'},
                  },
)
cross_validate(lr, X, y, cv=GroupKFold(),
               props={'scoring_weight': my_weights, 'groups': my_groups},
               scoring='accuracy',
               prop_routing={'estimator': '*',
                             'cv': ['groups'],
                             'scoring': {'sample_weight': 'scoring_weight'},
                             })

# %%
# Case C: unweighted feature selection

lr = LogisticRegressionCV(
    cv=GroupKFold(),
    scoring='accuracy',
    prop_routing={'cv': ['groups'],
                  'scoring': ['sample_weight'],
                  })
# The pipeline routes props only to its final step; nothing is routed to
# SelectKBest, which keeps feature selection unweighted.
pipe = make_pipeline(SelectKBest(), lr,
                     prop_routing={'logisticregressioncv': ['sample_weight',
                                                            'groups']})
# Cross-validate the pipeline, not the bare estimator: passing lr here would
# leave the feature-selection step out of this case entirely.
cross_validate(pipe, X, y, cv=GroupKFold(),
               props={'sample_weight': my_weights, 'groups': my_groups},
               scoring='accuracy',
               prop_routing={'estimator': '*',
                             'cv': ['groups'],
                             'scoring': ['sample_weight'],
                             })

# %%
# Case D: different scoring and fitting weights
# Two differently named weight props are passed.  The outer routing renames
# 'fitting_weight' to 'sample_weight' for the estimator and forwards
# 'scoring_weight' and 'groups' under their own names; both scorers consume
# 'scoring_weight' as 'sample_weight'.
lr = LogisticRegressionCV(
    cv=GroupKFold(),
    scoring='accuracy',
    prop_routing={'cv': ['groups'],
                  # read the following as "scoring should consume
                  # 'scoring_weight' as if it were 'sample_weight'."
                  'scoring': {'sample_weight': 'scoring_weight'},
                  },
)
cross_validate(lr, X, y, cv=GroupKFold(),
               props={'scoring_weight': my_weights, 'groups': my_groups,
                      'fitting_weight': my_other_weights},
               scoring='accuracy',
               prop_routing={'estimator': {'sample_weight': 'fitting_weight',
                                           'scoring_weight': 'scoring_weight',
                                           'groups': 'groups'},
                             'cv': ['groups'],
                             'scoring': {'sample_weight': 'scoring_weight'},
                             })