Source code for pharmpy.modeling.ml

from pathlib import Path

from pharmpy.deps import numpy as np
from pharmpy.deps import pandas as pd

from ..model import ModelfitResultsError
from .common import get_model_covariates
from .data import (
    get_ids,
    get_number_of_individuals,
    get_number_of_observations,
    get_number_of_observations_per_individual,
)


def _all_parameters(model):
    # All parameter estimates including fixed
    # This might warrant a function in modeling
    d = {p.name: p.init for p in model.parameters}
    d.update(dict(model.modelfit_results.parameter_estimates))
    return pd.Series(d)


def _create_dataset(model):
    res = model.modelfit_results
    if res is None:
        raise ModelfitResultsError("Need ModelfitResults for model")
    if res.residuals is None:
        raise ModelfitResultsError("No residuals available in ModelfitResults")
    if 'CWRES' not in res.residuals.columns:
        raise ModelfitResultsError("CWRES not an available residual in ModelfitResults")
    idcol = model.datainfo.id_column.name
    nids = get_number_of_individuals(model)
    nobs = get_number_of_observations(model)
    nobsi = get_number_of_observations_per_individual(model)
    cwres = res.residuals['CWRES']
    iofv = res.individual_estimates
    npar = len(model.parameters)
    ofv = res.ofv

    # Max ratio of abs(ETAi) and omegai
    variance_omegas = [
        model.random_variables.get_variance(rv.name).name for rv in model.random_variables.etas
    ]
    all_paramvals = _all_parameters(model)
    omega_estimates = np.sqrt(all_paramvals[variance_omegas])
    abs_ebes = res.individual_estimates.abs()
    ebe_ratio = abs_ebes / list(omega_estimates)
    max_ebe_ratio = ebe_ratio.max(axis=1).fillna(1.0)
    # Set to 1 if division by zero, e.g. from omega fix to 0

    # exp(OFVi / nobsi) / exp(OFV / nobs)
    iofv = res.individual_ofv
    ofv_ratio = np.exp(iofv / nobsi) / np.exp(model.modelfit_results.ofv / nobs)

    # mean(ETA / OMEGA)
    cov = res.individual_estimates_covariance
    etc_diag = np.sqrt(pd.DataFrame([np.diag(y) for y in cov], columns=cov.iloc[0].columns))
    etc_ratio = etc_diag / list(omega_estimates)
    mean_etc_ratio = etc_ratio.mean(axis=1).fillna(1.0)
    mean_etc_ratio.index = ofv_ratio.index

    # max((abs(indcov - mean(cov))) / sd(cov))
    cov_names = get_model_covariates(model, strings=True)
    if len(cov_names) > 0:
        covariates = model.dataset[cov_names + [idcol]].set_index(idcol)
        mean_covs = covariates.groupby(idcol).mean()
        maxcov = (abs(mean_covs - mean_covs.mean()) / mean_covs.std()).max(axis=1)
    else:
        maxcov = 0.0

    df = pd.DataFrame(
        {
            'nids': nids,
            'nobs': nobs,
            'nobs_subj': nobsi / (nobs / nids),
            'nobsi': nobsi,
            'ncovs': len(cov_names),
            'maxcov': maxcov,
            'max_cwres': abs(cwres).groupby(idcol).max(),
            'median_cwres': abs(cwres).groupby(idcol).median(),
            'max_ebe_ratio': max_ebe_ratio,
            'ofv_ratio': ofv_ratio,
            'etc_ratio': mean_etc_ratio,
            'iofv': iofv,
            'npar': npar,
            'ofv': ofv,
        }
    )
    return df


[docs]def predict_outliers(model, cutoff=3.0):
    """Predict outliers for a model using a machine learning model.

    See the :ref:`simeval <Individual OFV summary>` documentation for a definition of the `residual`

    Parameters
    ----------
    model : Model
        Pharmpy model
    cutoff : float
        Cutoff threshold for a residual singalling an outlier

    Returns
    -------
    pd.Dataframe
        Dataframe over the individuals with a `residual` column containing the raw predicted
        residuals and a `outlier` column with a boolean to tell whether the individual is
        an outlier or not.

    Examples
    --------
    >>> from pharmpy.modeling import *
    >>> model = load_example_model("pheno")
    >>> predict_outliers(model)     # doctest: +SKIP
        residual  outlier
    ID
    1  -0.281443    False
    2   1.057392    False
    3  -0.119105    False
    4  -0.846849    False
    5   0.600540    False
    6   1.014008    False
    7  -0.750734    False
    8   0.247175    False
    9   0.117002    False
    10 -0.835389    False
    11  3.529582     True
    12 -0.035670    False
    13  0.292333    False
    14  0.303278    False
    15 -0.565949    False
    16 -0.078192    False
    17 -0.291295    False
    18  2.466421    False
    19 -0.402343    False
    20 -0.699996    False
    21 -0.567987    False
    22 -0.526776    False
    23  0.303918    False
    24 -0.177588    False
    25  1.272142    False
    26 -0.390000    False
    27  0.775876    False
    28 -0.501528    False
    29 -0.700951    False
    30 -0.352599    False
    31  0.294196    False
    32  0.744014    False
    33 -0.215364    False
    34  0.208691    False
    35  1.713130    False
    36  0.300293    False
    37 -0.810736    False
    38  0.459877    False
    39 -0.675125    False
    40 -0.563835    False
    41 -0.526945    False
    42  4.449199     True
    43  0.720714    False
    44 -0.792248    False
    45 -0.860923    False
    46 -0.731858    False
    47 -0.247131    False
    48  1.894190    False
    49 -0.282737    False
    50 -0.153398    False
    51  1.200546    False
    52  0.902774    False
    53  0.586427    False
    54  0.183329    False
    55  1.036930    False
    56 -0.639868    False
    57 -0.765279    False
    58 -0.209665    False
    59 -0.225693    False

    See also
    --------
    predict_influential_individuals
    predict_influential_outliers

    """
    model_path = Path(__file__).parent.resolve() / 'ml_models' / 'outliers.tflite'
    data = _create_dataset(model)
    output = _predict_with_tflite(model_path, data)
    df = pd.DataFrame({'residual': output, 'outlier': output > cutoff}, index=get_ids(model))
    df.index.name = model.datainfo.id_column.name
    return df


[docs]def predict_influential_individuals(model, cutoff=3.84):
    """Predict influential individuals for a model using a machine learning model.

    Parameters
    ----------
    model : Model
        Pharmpy model
    cutoff : float
        Cutoff threshold for a dofv signalling an influential individual

    Returns
    -------
    pd.Dataframe
        Dataframe over the individuals with a `dofv` column containing the raw predicted
        delta-OFV and an `influential` column with a boolean to tell whether the individual is
        influential or not.

    See also
    --------
    predict_influential_outliers
    predict_outliers

    """

    model_path = Path(__file__).parent.resolve() / 'ml_models' / 'infinds.tflite'
    data = _create_dataset(model)
    output = _predict_with_tflite(model_path, data)
    df = pd.DataFrame({'dofv': output, 'influential': output > cutoff}, index=get_ids(model))
    df.index.name = model.datainfo.id_column.name
    return df


[docs]def predict_influential_outliers(model, outlier_cutoff=3, influential_cutoff=3.84):
    """Predict influential outliers for a model using a machine learning model.

    Parameters
    ----------
    model : Model
        Pharmpy model
    outlier_cutoff : float
        Cutoff threshold for a residual singalling an outlier
    influential_cutoff : float
        Cutoff threshold for a dofv signalling an influential individual

    Returns
    -------
    pd.Dataframe
        Dataframe over the individuals with a `outliers` and `dofv` columns containing the raw
        predictions and `influential`, `outlier` and `influential_outlier` boolean columns.

    See also
    --------
    predict_influential_individuals
    predict_outliers

    """

    outliers = predict_outliers(model, cutoff=outlier_cutoff)
    infinds = predict_influential_individuals(model, cutoff=influential_cutoff)
    df = pd.concat([outliers, infinds], axis=1)
    df['influential_outlier'] = df['outlier'] & df['influential']
    return df


def _predict_with_tflite(model_path, data):
    import tflite_runtime.interpreter as tflite

    interpreter = tflite.Interpreter(str(model_path))
    interpreter.allocate_tensors()
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()

    npdata = data.astype('float32').values
    nrows = len(data)
    output = np.empty(nrows)

    for i in range(0, nrows):
        interpreter.set_tensor(input_details[0]['index'], npdata[i : (i + 1), :])
        interpreter.invoke()
        output_data = interpreter.get_tensor(output_details[0]['index'])
        output[i] = output_data

    return output