Source code for pharmpy.modeling.data

from __future__ import annotations

import re
import warnings
from pathlib import Path
from typing import Collection, Container, Literal, Optional, Union

from pharmpy.basic import Expr, Unit
from pharmpy.deps import numpy as np
from pharmpy.deps import pandas as pd
from pharmpy.deps import sympy
from pharmpy.deps.rich import box as rich_box
from pharmpy.deps.rich import console as rich_console
from pharmpy.deps.rich import table as rich_table
from pharmpy.internals.fs.path import normalize_user_given_path, path_absolute
from pharmpy.model import (
    Assignment,
    ColumnInfo,
    CompartmentalSystem,
    DataInfo,
    DatasetError,
    Model,
    get_and_check_dataset,
)
from pharmpy.model.model import update_datainfo

from .expressions import get_dv_symbol
from .iterators import resample_data



[docs]
def get_ids(model: Model) -> list[int]:
    """Retrieve a list of all subject ids of the dataset

    Parameters
    ----------
    model : Model
        Pharmpy model

    Returns
    -------
    list
        All subject ids

    Example
    -------
    >>> from pharmpy.modeling import load_example_model, get_ids
    >>> model = load_example_model("pheno")
    >>> get_ids(model)      # doctest: +ELLIPSIS
    [1, 2, 3, ..., 57, 58, 59]
    """
    df = get_and_check_dataset(model)
    idcol = model.datainfo.id_column.name
    ids = list(int(x) for x in df[idcol].unique())
    return ids




[docs]
def get_number_of_individuals(model: Model):
    """Retrieve the number of individuals in the model dataset

    Parameters
    ----------
    model : Model
        Pharmpy model

    Returns
    -------
    int
        Number of individuals in the model dataset

    Examples
    --------
    >>> from pharmpy.modeling import get_number_of_individuals, load_example_model
    >>> model = load_example_model("pheno")
    >>> get_number_of_individuals(model)
    59

    Notes
    -----
    For NONMEM models this is the number of individuals of the active dataset, i.e. after filtering
    of IGNORE and ACCEPT and removal of individuals with no observations.

    See also
    --------
    get_number_of_observations : Get the number of observations in a dataset
    get_number_of_observations_per_individual : Get the number of observations per individual in a
        dataset

    """
    return len(get_ids(model))




[docs]
def get_number_of_observations(model: Model):
    """Retrieve the total number of observations in the model dataset

    Parameters
    ----------
    model : Model
        Pharmpy model

    Returns
    -------
    int
        Number of observations in the model dataset

    Examples
    --------
    >>> from pharmpy.modeling import get_number_of_observations, load_example_model
    >>> model = load_example_model("pheno")
    >>> get_number_of_observations(model)
    155

    Notes
    -----
    For NONMEM models this is the number of observations of the active dataset, i.e. after filtering
    of IGNORE and ACCEPT and removal of individuals with no observations.

    See also
    --------
    get_number_of_individuals : Get the number of individuals in a dataset
    get_number_of_observations_per_individual : Get the number of observations per individual in a
        dataset

    """
    return len(get_observations(model))




[docs]
def get_number_of_observations_per_individual(model: Model):
    """Number of observations for each individual

    Parameters
    ----------
    model : Model
        Pharmpy model

    Returns
    -------
    pd.Series
        Number of observations in the model dataset

    Examples
    --------
    >>> from pharmpy.modeling import get_number_of_observations_per_individual, load_example_model
    >>> model = load_example_model("pheno")
    >>> get_number_of_observations_per_individual(model)
    ID
    1     2
    2     3
    3     3
    4     3
    5     3
    6     3
    7     3
    8     3
    9     4
    10    3
    11    1
    12    3
    13    2
    14    4
    15    2
    16    3
    17    3
    18    4
    19    3
    20    3
    21    3
    22    2
    23    3
    24    3
    25    6
    26    2
    27    2
    28    1
    29    1
    30    2
    31    1
    32    3
    33    2
    34    2
    35    2
    36    3
    37    2
    38    4
    39    3
    40    2
    41    3
    42    2
    43    1
    44    3
    45    3
    46    1
    47    1
    48    5
    49    3
    50    4
    51    3
    52    3
    53    2
    54    4
    55    1
    56    1
    57    2
    58    3
    59    3
    Name: observation_count, dtype: int64

    Notes
    -----
    For NONMEM models this is the individuals and number of observations of the active dataset, i.e.
    after filtering of IGNORE and ACCEPT and removal of individuals with no observations.

    See also
    --------
    get_number_of_individuals : Get the number of individuals in a dataset
    get_number_of_observations_per_individual : Get the number of observations per individual in a
        dataset

    """
    ser = get_observations(model).groupby(model.datainfo.id_column.name).count()
    ser.name = "observation_count"
    return ser




[docs]
def get_observations(
    model: Model, keep_index: bool = False, dv: Union[Expr, str, int, None] = None
) -> pd.Series:
    """Get observations from dataset

    Parameters
    ----------
    model : Model
        Pharmpy model
    keep_index : bool
        Set to True if the original index should be kept.
        Otherwise a new index using ID and idv will be created.
    dv : Union[Expr, str, int, None]
        Name or DVID of dependent variable. None for the default (first or only)

    Returns
    -------
    pd.Series
        Observations indexed over ID and TIME

    Examples
    --------
    >>> from pharmpy.modeling import get_observations, load_example_model
    >>> model = load_example_model("pheno")
    >>> get_observations(model)
        ID  TIME
    1   2.0      17.3
        112.5    31.0
    2   2.0       9.7
        63.5     24.6
        135.5    33.0
                 ...
    58  47.5     27.9
        131.8    31.0
    59  1.8      22.6
        73.8     34.3
        146.8    40.2
    Name: DV, Length: 155, dtype: float64

    See also
    --------
    get_number_of_observations : get the number of observations
    get_number_of_observations_per_individual : get the number of observations per individual
    """
    try:
        label = model.datainfo.typeix['mdv'][0].name
    except IndexError:
        try:
            label = model.datainfo.typeix['event'][0].name
        except IndexError:
            try:
                label = model.datainfo.typeix['dose'][0].name
            except IndexError:
                label = None  # All data records are observations

    idcol = model.datainfo.id_column.name
    idvcol = model.datainfo.idv_column.name
    dvcol = model.datainfo.dv_column.name

    df = get_and_check_dataset(model)

    if len(model.dependent_variables) > 1:
        dv = get_dv_symbol(model, dv)
        dvid = model.dependent_variables[dv]
        dvidlab = model.datainfo.find_single_column_name("dvid")
        df = df.loc[df[dvidlab] == dvid]

    if label:
        df = df.query(f'{label} == 0')
        if df.empty:
            df = df.astype({label: 'float'})
            df = df.query(f'{label} == 0')
    else:
        df = df.copy()

    if not keep_index:
        df = df[[idcol, idvcol, dvcol]]
        try:
            # FIXME: This shouldn't be needed
            df = df.astype({idvcol: np.float64})
        except ValueError:
            # TIME could not be converted to float (e.g. 10:15)
            pass
        df.set_index([idcol, idvcol], inplace=True)
        df = df.squeeze()
    else:
        df = df[dvcol]
    return df




[docs]
def get_baselines(model: Model):
    """Baselines for each subject.

    Baseline is taken to be the first row even if that has a missing value.

    Parameters
    ----------
    model : Model
        Pharmpy model

    Returns
    -------
    pd.DataFrame
        Dataset with the baselines

    Examples
    --------
    >>> from pharmpy.modeling import load_example_model, get_baselines
    >>> model = load_example_model("pheno")
    >>> get_baselines(model)
         TIME  AMT  WGT  APGR   DV  FA1  FA2
    ID
    1    0.0  25.0  1.4   7.0  0.0  1.0  1.0
    2    0.0  15.0  1.5   9.0  0.0  1.0  1.0
    3    0.0  30.0  1.5   6.0  0.0  1.0  1.0
    4    0.0  18.6  0.9   6.0  0.0  1.0  1.0
    5    0.0  27.0  1.4   7.0  0.0  1.0  1.0
    6    0.0  24.0  1.2   5.0  0.0  1.0  1.0
    7    0.0  19.0  1.0   5.0  0.0  1.0  1.0
    8    0.0  24.0  1.2   7.0  0.0  1.0  1.0
    9    0.0  27.0  1.4   8.0  0.0  1.0  1.0
    10   0.0  27.0  1.4   7.0  0.0  1.0  1.0
    11   0.0  24.0  1.2   7.0  0.0  1.0  1.0
    12   0.0  26.0  1.3   6.0  0.0  1.0  1.0
    13   0.0  11.0  1.1   6.0  0.0  1.0  1.0
    14   0.0  22.0  1.1   7.0  0.0  1.0  1.0
    15   0.0  26.0  1.3   7.0  0.0  1.0  1.0
    16   0.0  12.0  1.2   9.0  0.0  1.0  1.0
    17   0.0  22.0  1.1   5.0  0.0  1.0  1.0
    18   0.0  20.0  1.0   5.0  0.0  1.0  1.0
    19   0.0  10.0  1.0   1.0  0.0  1.0  1.0
    20   0.0  24.0  1.2   6.0  0.0  1.0  1.0
    21   0.0  17.5  1.8   7.0  0.0  1.0  1.0
    22   0.0  15.0  1.5   8.0  0.0  1.0  1.0
    23   0.0  60.0  3.1   3.0  0.0  1.0  1.0
    24   0.0  63.0  3.2   2.0  0.0  1.0  1.0
    25   0.0  15.0  0.7   1.0  0.0  1.0  1.0
    26   0.0  70.0  3.5   9.0  0.0  1.0  1.0
    27   0.0  35.0  1.9   5.0  0.0  1.0  1.0
    28   0.0  60.0  3.2   9.0  0.0  1.0  1.0
    29   0.0  20.0  1.0   7.0  0.0  1.0  1.0
    30   0.0  18.0  1.8   8.0  0.0  1.0  1.0
    31   0.0  30.0  1.4   8.0  0.0  1.0  1.0
    32   0.0  70.0  3.6   9.0  0.0  1.0  1.0
    33   0.0  17.0  1.7   8.0  0.0  1.0  1.0
    34   0.0  34.0  1.7   4.0  0.0  1.0  1.0
    35   0.0  25.0  2.5   5.0  0.0  1.0  1.0
    36   0.0  30.0  1.5   5.0  0.0  1.0  1.0
    37   0.0  24.0  1.2   9.0  0.0  1.0  1.0
    38   0.0  26.0  1.3   8.0  0.0  1.0  1.0
    39   0.0  56.0  1.9  10.0  0.0  1.0  1.0
    40   0.0  19.0  1.1   3.0  0.0  1.0  1.0
    41   0.0  34.0  1.7   7.0  0.0  1.0  1.0
    42   0.0  28.0  2.8   9.0  0.0  1.0  1.0
    43   0.0  18.0  0.9   1.0  0.0  1.0  1.0
    44   0.0  14.0  1.4   7.0  0.0  1.0  1.0
    45   0.0  16.0  0.8   2.0  0.0  1.0  1.0
    46   0.0  11.0  1.1   8.0  0.0  1.0  1.0
    47   0.0  40.0  2.6   9.0  0.0  1.0  1.0
    48   0.0  14.0  0.7   8.0  0.0  1.0  1.0
    49   0.0  26.0  1.3   8.0  0.0  1.0  1.0
    50   0.0  20.0  1.1   6.0  0.0  1.0  1.0
    51   0.0  18.0  0.9   9.0  0.0  1.0  1.0
    52   0.0   9.5  0.9   7.0  0.0  1.0  1.0
    53   0.0  17.0  1.7   8.0  0.0  1.0  1.0
    54   0.0  18.0  1.8   8.0  0.0  1.0  1.0
    55   0.0  25.0  1.1   4.0  0.0  1.0  1.0
    56   0.0  12.0  0.6   4.0  0.0  1.0  1.0
    57   0.0  20.0  2.1   6.0  0.0  1.0  1.0
    58   0.0  14.0  1.4   8.0  0.0  1.0  1.0
    59   0.0  22.8  1.1   6.0  0.0  1.0  1.0
    """
    idlab = model.datainfo.id_column.name
    baselines = model.dataset.groupby(idlab).nth(0).set_index(idlab)
    return baselines




[docs]
def set_covariates(model: Model, covariates: Container[str]):
    """Set columns in the dataset to be covariates in the datainfo

    Parameters
    ----------
    model : Model
        Pharmpy model
    covariates : Container
        Column names

    Returns
    -------
    Model
        Updated Pharmpy model
    """
    di = model.datainfo
    newcols = []
    for col in di:
        if col.name in covariates:
            newcol = col.replace(type='covariate')
            newcols.append(newcol)
        else:
            newcols.append(col)
    model = model.replace(datainfo=di.replace(columns=newcols))
    return model.update_source()




[docs]
def set_dvid(model: Model, name: str):
    """Set a column to act as DVID. Replace DVID if one is already set.

    Parameters
    ----------
    model : Model
        Pharmpy model
    name : str
        Name of DVID column

    Returns
    -------
    Model
        Updated Pharmpy model
    """
    di = model.datainfo
    col = di[name]
    if col.type == 'dvid':
        return model

    try:
        curdvid = di.typeix['dvid'][0]
    except IndexError:
        pass
    else:
        curdvid = curdvid.replace(type='unknown')
        di = di.set_column(curdvid)

    col = col.replace(
        type='dvid',
        unit=1,
        scale='nominal',
        continuous=False,
        drop=False,
        descriptor='observation identifier',
    )
    df = model.dataset
    if not col.is_integer():
        ser = df[name]
        converted = pd.to_numeric(ser, downcast='integer')
        if not pd.api.types.is_integer_dtype(converted):
            raise ValueError(
                f"Could not use column {name} as DVID because it contains non-integral values"
            )
        df = df.assign(**{name: converted})
        col = col.replace(datatype=ColumnInfo.convert_pd_dtype_to_datatype(converted.dtype))
        new_dataset = True
    else:
        new_dataset = False

    col = col.replace(categories=sorted(df[name].unique()))

    di = di.set_column(col)

    if new_dataset:
        model = model.replace(datainfo=di, dataset=df)
    else:
        model = model.replace(datainfo=di)
    return model.update_source()




[docs]
def get_covariate_baselines(model: Model):
    """Return a dataframe with baselines of all covariates for each id.

    Baseline is taken to be the first row even if that has a missing value.

    Parameters
    ----------
    model : Model
        Pharmpy model

    Returns
    -------
    pd.DataFrame
        covariate baselines

    See also
    --------
    get_baselines : baselines for all data columns

    Example
    -------
    >>> from pharmpy.modeling import load_example_model, get_covariate_baselines, set_covariates
    >>> model = load_example_model("pheno")
    >>> model = set_covariates(model, ["WGT", "APGR"])
    >>> get_covariate_baselines(model)
        WGT  APGR
    ID
    1   1.4   7.0
    2   1.5   9.0
    3   1.5   6.0
    4   0.9   6.0
    5   1.4   7.0
    6   1.2   5.0
    7   1.0   5.0
    8   1.2   7.0
    9   1.4   8.0
    10  1.4   7.0
    11  1.2   7.0
    12  1.3   6.0
    13  1.1   6.0
    14  1.1   7.0
    15  1.3   7.0
    16  1.2   9.0
    17  1.1   5.0
    18  1.0   5.0
    19  1.0   1.0
    20  1.2   6.0
    21  1.8   7.0
    22  1.5   8.0
    23  3.1   3.0
    24  3.2   2.0
    25  0.7   1.0
    26  3.5   9.0
    27  1.9   5.0
    28  3.2   9.0
    29  1.0   7.0
    30  1.8   8.0
    31  1.4   8.0
    32  3.6   9.0
    33  1.7   8.0
    34  1.7   4.0
    35  2.5   5.0
    36  1.5   5.0
    37  1.2   9.0
    38  1.3   8.0
    39  1.9  10.0
    40  1.1   3.0
    41  1.7   7.0
    42  2.8   9.0
    43  0.9   1.0
    44  1.4   7.0
    45  0.8   2.0
    46  1.1   8.0
    47  2.6   9.0
    48  0.7   8.0
    49  1.3   8.0
    50  1.1   6.0
    51  0.9   9.0
    52  0.9   7.0
    53  1.7   8.0
    54  1.8   8.0
    55  1.1   4.0
    56  0.6   4.0
    57  2.1   6.0
    58  1.4   8.0
    59  1.1   6.0
    """
    covariates = model.datainfo.typeix['covariate'].names
    idlab = model.datainfo.id_column.name
    df = model.dataset[covariates + [idlab]]
    df = df.set_index(idlab)
    return df.groupby(idlab).nth(0)




[docs]
def list_time_varying_covariates(model: Model):
    """Return a list of names of all time varying covariates

    Parameters
    ----------
    model : Model
        Pharmpy model

    Returns
    -------
    list
        Names of all time varying covariates

    See also
    --------
    get_covariate_baselines : get baselines for all covariates

    Example
    -------
    >>> from pharmpy.modeling import load_example_model, list_time_varying_covariates
    >>> model = load_example_model("pheno")
    >>> list_time_varying_covariates(model)
    []

    """
    cov_labels = model.datainfo.typeix['covariate'].names
    if len(cov_labels) == 0:
        return []
    else:
        time_var = (
            model.dataset.groupby(by=model.datainfo.id_column.name)[cov_labels]
            .nunique()
            .gt(1)
            .any()
        )
        return list(time_var.index[time_var])




[docs]
def get_doses(model: Model):
    """Get a series of all doses

    Indexed with ID and TIME

    Parameters
    ----------
    model : Model
        Pharmpy model

    Returns
    -------
    pd.Series
        doses

    Example
    -------
    >>> from pharmpy.modeling import load_example_model, get_doses
    >>> model = load_example_model("pheno")
    >>> get_doses(model)
    ID  TIME
    1   0.0      25.0
        12.5      3.5
        24.5      3.5
        37.0      3.5
        48.0      3.5
                 ...
    59  96.0      3.0
        108.3     3.0
        120.5     3.0
        132.3     3.0
        144.8     3.0
    Name: AMT, Length: 589, dtype: float64

    """
    try:
        label = model.datainfo.typeix['dose'][0].name
    except IndexError:
        raise DatasetError('Could not identify dosing rows in dataset')

    idcol = model.datainfo.id_column.name
    idvcol = model.datainfo.idv_column.name
    df = model.dataset.query(f'{label} != 0')
    df = df[[idcol, idvcol, label]]
    try:
        # FIXME: This shouldn't be needed
        df = df.astype({idvcol: np.float64})
    except ValueError:
        # TIME could not be converted to float (e.g. 10:15)
        pass
    df.set_index([idcol, idvcol], inplace=True)
    return df.squeeze()




[docs]
def expand_additional_doses(model: Model, flag: bool = False):
    """Expand additional doses into separate dose records

    Parameters
    ----------
    model : Model
        Pharmpy model
    flag : bool
        True to add a boolean EXPANDED column to mark added records. In this case all
        columns in the original dataset will be kept. Care needs to be taken to handle
        the new dataset.

    Returns
    -------
    Model
        Updated Pharmpy model
    """
    try:
        addl = model.datainfo.typeix['additional'][0].name
        ii = model.datainfo.typeix['ii'][0].name
    except IndexError:
        return model
    idv = model.datainfo.idv_column.name
    idcol = model.datainfo.id_column.name

    df = get_and_check_dataset(model)

    try:
        event = model.datainfo.typeix['event'][0].name
    except IndexError:
        df = df.assign(_RESETGROUP=1.0)
    else:
        df = df.assign(_FLAG=df[event] >= 3)
        df = df.assign(_RESETGROUP=df.groupby('ID')['_FLAG'].cumsum())
        df = df.drop('_FLAG', axis=1)

    def fn(a):
        if a[addl] == 0:
            times = [a[idv]]
            expanded = [False]
        else:
            length = int(a[addl])
            times = [a[ii] * x + a[idv] for x in range(length + 1)]
            expanded = [False] + [True] * length
        a['_TIMES'] = times
        a['_EXPANDED'] = expanded
        return a

    df = df.apply(fn, axis=1)
    df = df.apply(lambda x: x.explode() if x.name in ['_TIMES', '_EXPANDED'] else x)
    df = df.astype({'_EXPANDED': np.bool_})
    df = df.groupby([idcol, '_RESETGROUP'], group_keys=False)[df.columns].apply(
        lambda x: x.sort_values(by='_TIMES', kind='stable')
    )
    df[idv] = df['_TIMES'].astype(np.float64)
    df.drop(['_TIMES', '_RESETGROUP'], axis=1, inplace=True)
    if flag:
        df.rename(columns={'_EXPANDED': 'EXPANDED'}, inplace=True)
    else:
        df.drop([addl, ii, '_EXPANDED'], axis=1, inplace=True)
    model = model.replace(dataset=df.reset_index(drop=True))
    return model.update_source()




[docs]
def get_doseid(model: Model):
    """Get a DOSEID series from the dataset with an id of each dose period starting from 1

    If a a dose and observation exist at the same time point the observation will be counted
    towards the previous dose.

    Parameters
    ----------
    model : Model
        Pharmpy model

    Returns
    -------
    pd.Series
        DOSEIDs

    Examples
    --------
    >>> from pharmpy.modeling import load_example_model, get_doseid
    >>> model = load_example_model("pheno")
    >>> get_doseid(model)  # doctest: +ELLIPSIS
    0       1
    1       1
    2       2
    3       3
    4       4
           ..
    739    10
    740    11
    741    12
    742    13
    743    13
    Name: DOSEID, Length: 744, dtype: int...
    """
    try:
        dose = model.datainfo.typeix['dose'][0].name
    except IndexError:
        raise DatasetError('Could not identify dosing rows in dataset')

    df = model.dataset.copy()
    df['DOSEID'] = df[dose]
    df.loc[df['DOSEID'] > 0, 'DOSEID'] = 1
    df['DOSEID'] = df['DOSEID'].astype(int)
    idcol = model.datainfo.id_column.name
    df['DOSEID'] = df.groupby(idcol)['DOSEID'].cumsum()

    # Adjust for dose and observation at the same time point
    # Observation is moved to previous dose group
    # Except for steady state dose where the dose group is kept
    try:
        eventcol = model.datainfo.typeix['event'][0].name
    except IndexError:
        df['_RESETGROUP'] = 1.0
    else:
        df['_FLAG'] = df[eventcol] >= 3
        df['_RESETGROUP'] = df.groupby('ID')['_FLAG'].cumsum()

    try:
        ss = model.datainfo.typeix['ss'][0].name
    except IndexError:
        ss = None

    idvcol = model.datainfo.idv_column.name
    ser = df.groupby([idcol, idvcol, '_RESETGROUP']).size()
    nonunique = ser[ser > 1]

    for i, time, _ in nonunique.index:
        groupind = df[(df[idcol] == i) & (df[idvcol] == time)].index
        obsind = df[(df[idcol] == i) & (df[idvcol] == time) & (df[dose] == 0)].index
        doseind = set(groupind) - set(obsind)
        if not doseind:
            continue
        maxind = max(doseind)
        for index in obsind:
            if 0 in groupind:  # This is the first dose
                continue
            if maxind > index:  # Dose record is after the observation
                continue
            if ss and df.loc[maxind, ss] > 0:  # No swap for SS dosing
                continue
            curdoseid = df.loc[index, 'DOSEID']
            df.loc[index, 'DOSEID'] = curdoseid - 1

    return df['DOSEID'].copy()




[docs]
def get_mdv(model: Model):
    """Get MDVs from dataset

    Parameters
    ----------
    model : Model
        Pharmpy model

    Returns
    -------
    pd.Series
        MDVs
    """
    found = False
    for key in ['mdv', 'event', 'dose']:
        try:
            label = model.datainfo.typeix[key][0].name
            found = True
            break
        except IndexError:
            pass
    else:
        label = model.datainfo.dv_column.name

    data = get_and_check_dataset(model)[label].astype('float64').squeeze()

    series = data.where(data == 0, other=1) if found else pd.Series(np.zeros(len(data)))

    return series.astype('int32').rename('MDV')




[docs]
def get_evid(model: Model):
    """Get the evid from model dataset

    If an event column is present this will be extracted otherwise
    an evid column will be created.

    Parameters
    ----------
    model : Model
        Pharmpy model

    Returns
    -------
    pd.Series
        EVID
    """
    di = model.datainfo
    try:
        eventcols = di.typeix['event']
    except IndexError:
        pass
    else:
        return model.dataset[eventcols[0].name]
    mdv = get_mdv(model)
    return mdv.rename('EVID')




[docs]
def get_admid(model: Model) -> pd.Series:
    """Get the admid from model dataset

    If an administration column is present this will be extracted otherwise
    an admid column will be created based on the admids of the present doses.
    This is dependent on the presence of a CMT column to be generated correctly.

    When generated, admids of events in between doses is set to the last used
    admid.

    Parameters
    ----------
    model : Model
        Pharmpy model

    Returns
    -------
    pd.Series
        ADMID
    """
    di = model.datainfo
    try:
        admidcols = di.typeix["admid"]
    except IndexError:
        pass
    else:
        return model.dataset[admidcols[0].name]

    odes = model.statements.ode_system
    names = odes.compartment_names
    remap = {}
    if isinstance(odes, CompartmentalSystem):
        for dosing in odes.dosing_compartments:
            remap[names.index(dosing.name) + 1] = dosing.doses[0].admid
    adm = get_cmt(model)
    adm = adm.replace(remap)
    adm.name = "ADMID"

    # Replace all observations with the previous admid type
    current_admin = adm[0]
    current_subject = model.dataset["ID"][0]
    for i, data in enumerate(zip(get_evid(model), adm, model.dataset["ID"])):
        event = data[0]
        admin = data[1]
        subject = data[2]
        if current_subject == subject:
            if event == 1:
                current_admin = admin
            if event != 1:
                if current_admin is not None:
                    adm[i] = current_admin
        else:
            current_subject = subject
            current_admin = admin
    return adm




[docs]
def add_admid(model: Model):
    """
    Add an admid column to the model dataset and datainfo. Dependent on the
    presence of a CMT column in order to add admid correctly.

    When generated, admids of events in between doses is set to the last used
    admid.

    Parameters
    ----------
    model : Model
        Pharmpy model

    Returns
    -------
    model : Model
        Updated Pharmpy model

    See also
    --------
    get_admid : Get or create an admid column
    get_cmt : Get or create a cmt column
    """
    di = model.datainfo
    if "admid" not in di.types:
        adm = get_admid(model)
        dataset = model.dataset
        dataset["ADMID"] = adm
        di = update_datainfo(model.datainfo, dataset)
        colinfo = di['ADMID'].replace(type='admid')
        model = model.replace(datainfo=di.set_column(colinfo), dataset=dataset)

    return model.update_source()



def set_admid(model: Model, column_name: str):
    """
    Set the specified column in the dataset to the admid data type.

    Parameters
    ----------
    model : Model
        Pharmpy model
    column_name : str
        name of column to set as admid

    Returns
    -------
    model : Model
        Updated Pharmpy model

    See also
    --------
    get_admid : Get or create an admid column
    add_admid : Add an admid column to the dataset
    """
    di = model.datainfo
    colinfo = di[column_name].replace(type="admid")
    model = model.replace(datainfo=di.set_column(colinfo))
    return model.update_source()



[docs]
def get_cmt(model: Model):
    """Get the cmt (compartment) column from the model dataset

    If a cmt column is present this will be extracted otherwise
    a cmt column will be created. If created, multiple dose compartments are
    dependent on the presence of an admid type column, otherwise, dose/non-dose
    will be considered.

    Parameters
    ----------
    model : Model
        Pharmpy model

    Returns
    -------
    pd.Series
        CMT
    """
    di = model.datainfo
    try:
        cmtcols = di.typeix['compartment']
    except IndexError:
        pass
    else:
        return model.dataset[cmtcols[0].name]

    # See if admid exist
    try:
        admidcols = di.typeix["admid"]
    except IndexError:
        # No admid found --> Assume dose/non-dose
        odes = model.statements.ode_system
        if isinstance(odes, CompartmentalSystem):
            try:
                dosing = odes.dosing_compartments[0]
            except ValueError:  # No dosing
                dose_cmt = 1
            else:
                names = odes.compartment_names
                dose_cmt = names.index(dosing.name) + 1
        else:
            dose_cmt = 1
        cmt = get_evid(model)
        cmt = cmt.replace({1: dose_cmt, 2: 0, 3: 0, 4: dose_cmt})  # Only consider dose/non-dose
        cmt.name = "CMT"
        return cmt
    else:
        admidcols = model.dataset[admidcols[0].name]
        # Admid found -> convert to CMT based on doses
        odes = model.statements.ode_system
        names = odes.compartment_names
        remap = {}
        if isinstance(odes, CompartmentalSystem):
            for dosing in odes.dosing_compartments:
                if dosing == odes.central_compartment:
                    remap[2] = names.index(dosing.name) + 1
                    central_number = names.index(dosing.name) + 1
                else:
                    remap[1] = names.index(dosing.name) + 1
        admidcols = admidcols.replace(remap)
        admidcols.loc[get_evid(model) == 0] = central_number
        admidcols.name = "ADMID"
        return admidcols




[docs]
def add_cmt(model: Model):
    """Add a CMT column to the model dataset and datainfo if not existed

    In case of multiple doses, this method is dependent on the presence of an
    admid column to correctly number each dose.

    NOTE : Existing CMT is based on datainfo type being set to 'compartment'
    and a column named 'CMT' can be replaced

    Parameters
    ----------
    model : Model
        Pharmpy model

    Returns
    -------
    model : Model
        Updated Pharmpy model

    See also
    --------
    get_admid : Get or create an admid column
    get_cmt : Get or create a cmt column
    """
    di = model.datainfo
    if "compartment" not in di.types:
        cmt_name = "CMT"
        cmt = get_cmt(model)
        dataset = model.dataset
        dataset[cmt_name] = cmt
        di = update_datainfo(model.datainfo, dataset)
        colinfo = di[cmt_name].replace(type='compartment')
        model = model.replace(datainfo=di.set_column(colinfo), dataset=dataset)

    return model.update_source()




[docs]
def add_time_of_last_dose(model: Model, name: str = "TDOSE"):
    """Add a variable for time of last dose to the model

    Parameters
    ----------
    model : Model
        Pharmpy model
    name : str
        Name of time of last dose variable

    Returns
    -------
    Model
        Updated Pharmpy model

    See also
    --------
    add_time_after_dose : Add time after dose to dataset
    """
    idv = model.datainfo.idv_column.symbol
    amt = model.datainfo.typeix['dose'][0].symbol
    expr = Expr.forward(idv, amt > 0)
    symbol = Expr.symbol(name)
    assignment = Assignment(symbol, expr)
    statements = assignment + model.statements
    model = model.replace(statements=statements)
    return model.update_source()




[docs]
def add_time_after_dose(model: Model):
    """Calculate and add a TAD column to the dataset

    Parameters
    ----------
    model : Model
        Pharmpy model

    Returns
    -------
    Model
        Updated Pharmpy model

    Examples
    --------
    >>> from pharmpy.modeling import load_example_model, add_time_after_dose
    >>> model = load_example_model("pheno")
    >>> model = add_time_after_dose(model)

    See also
    --------
    add_time_of_last_dose : Add time of last dose to model
    """

    try:
        model.datainfo.descriptorix['time after dose']
    except IndexError:
        pass
    else:
        # Already have time after dose
        return model
    temp = translate_nmtran_time(model)
    idv = temp.datainfo.idv_column.name
    idlab = temp.datainfo.id_column.name
    df = get_and_check_dataset(model).copy()
    df['_NEWTIME'] = temp.dataset[idv]

    try:
        addl = temp.datainfo.typeix['additional'][0].name
    except IndexError:
        addl = None
    else:
        # FIXME: Temp workaround, should be canonicalized in Model.replace
        di = update_datainfo(temp.datainfo, df)
        new_idvcol = di.idv_column.replace(type='unknown')
        new_timecol = di['_NEWTIME'].replace(type='idv')
        di = di.set_column(new_idvcol).set_column(new_timecol)
        temp = temp.replace(datainfo=di, dataset=df)
        temp = expand_additional_doses(temp, flag=True)
        df = temp.dataset

    df['_DOSEID'] = get_doseid(temp)

    # Sort in case DOSEIDs are non-increasing
    df = (
        df.groupby(idlab)[df.columns]
        .apply(lambda x: x.sort_values(by=['_DOSEID'], kind='stable', ignore_index=True))
        .reset_index(drop=True)
    )

    df['TAD'] = df.groupby([idlab, '_DOSEID'])['_NEWTIME'].diff().fillna(0.0)
    df['TAD'] = df.groupby([idlab, '_DOSEID'])['TAD'].cumsum()

    if addl:
        df = df[~df['EXPANDED']].reset_index(drop=True)
        df.drop(columns=['EXPANDED'], inplace=True)

    df.drop(columns=['_NEWTIME', '_DOSEID'], inplace=True)

    # FIXME: Temp workaround, should be canonicalized in Model.replace
    di = update_datainfo(model.datainfo, df)
    colinfo = di['TAD'].replace(descriptor='time after dose', unit=di[idv].unit)
    model = model.replace(datainfo=di.set_column(colinfo), dataset=df)
    return model.update_source()




[docs]
def get_concentration_parameters_from_data(model: Model):
    """Create a dataframe with concentration parameters

    Note that all values are directly calculated from the dataset

    Parameters
    ----------
    model : Model
        Pharmpy model

    Returns
    -------
    pd.DataFrame
        Concentration parameters

    Examples
    --------
    >>> from pharmpy.modeling import load_example_model, get_concentration_parameters_from_data
    >>> model = load_example_model("pheno")
    >>> get_concentration_parameters_from_data(model)
               Cmax  Tmax  Cmin  Tmin
    ID DOSEID
    1  1       17.3   2.0   NaN   NaN
       2        NaN   NaN   NaN   NaN
       3        NaN   NaN   NaN   NaN
       4        NaN   NaN   NaN   NaN
       5        NaN   NaN   NaN   NaN
    ...         ...   ...   ...   ...
    59 9        NaN   NaN   NaN   NaN
       10       NaN   NaN   NaN   NaN
       11       NaN   NaN   NaN   NaN
       12       NaN   NaN   NaN   NaN
       13      40.2   2.0   NaN   NaN
    <BLANKLINE>
    [589 rows x 4 columns]
    """
    model = add_time_after_dose(model)
    doseid = get_doseid(model)
    df = get_and_check_dataset(model).copy()
    df['DOSEID'] = doseid
    idlab = model.datainfo.id_column.name
    dv = model.datainfo.dv_column.name
    noobs = df.groupby([idlab, 'DOSEID']).size() == 1
    idx = df.groupby([idlab, 'DOSEID'])[dv].idxmax()
    params = df.loc[idx].set_index([idlab, 'DOSEID'])
    params = params[[dv, 'TAD']]
    params.rename(columns={dv: 'Cmax', 'TAD': 'Tmax'}, inplace=True)
    params.loc[noobs] = np.nan

    grpind = df.groupby(['ID', 'DOSEID']).indices
    keep = []
    for ind, rows in grpind.items():
        index = idx.loc[ind]
        p = params.loc[ind]
        if not np.isnan(p['Tmax']):
            keep += [row for row in rows if row > index]
    minidx = df.iloc[keep].groupby([idlab, 'DOSEID'])[dv].idxmin()
    params2 = df.loc[minidx].set_index([idlab, 'DOSEID'])
    params2 = params2[[dv, 'TAD']]
    params2.rename(columns={dv: 'Cmin', 'TAD': 'Tmin'}, inplace=True)
    res = params.join(params2)
    return res




[docs]
def drop_dropped_columns(model: Model):
    """Drop columns marked as dropped from the dataset

    NM-TRAN date columns will not be dropped by this function
    even if marked as dropped.
    Columns not specified in the datainfo ($INPUT for NONMEM)
    will also be dropped from the dataset.

    Parameters
    ----------
    model : Model
        Pharmpy model

    Returns
    -------
    Model
        Updated Pharmpy model

    Example
    -------
    >>> from pharmpy.modeling import *
    >>> model = load_example_model("pheno")
    >>> model = drop_dropped_columns(model)
    >>> list(model.dataset.columns)
    ['ID', 'TIME', 'AMT', 'WGT', 'APGR', 'DV', 'FA1', 'FA2']

    See also
    --------
    drop_columns : Drop specific columns or mark them as drop
    """
    datainfo = model.datainfo
    todrop = [
        colname
        for colname in datainfo.names
        if datainfo[colname].drop and datainfo[colname].datatype != 'nmtran-date'
    ]
    df = get_and_check_dataset(model)
    todrop += list(set(df.columns) - set(datainfo.names))
    model = drop_columns(model, todrop)
    return model.update_source()




[docs]
def drop_columns(model: Model, column_names: Union[list[str], str], mark: bool = False):
    """Drop columns from the dataset or mark as dropped

    Parameters
    ----------
    model : Model
        Pharmpy model
    column_names : list or str
        List of column names or one column name to drop or mark as dropped
    mark : bool
        Default is to remove column from dataset. Set this to True to only mark as dropped

    Returns
    -------
    Model
        Updated Pharmpy model

    Example
    -------
    >>> from pharmpy.modeling import *
    >>> model = load_example_model("pheno")
    >>> model = drop_columns(model, ['WGT', 'APGR'])
    >>> list(model.dataset.columns)
    ['ID', 'TIME', 'AMT', 'DV', 'FA1', 'FA2']

    See also
    --------
    drop_dropped_columns : Drop all columns marked as drop
    undrop_columns : Undrop columns of model
    """
    if isinstance(column_names, str):
        column_names = [column_names]
    di = model.datainfo
    newcols, to_drop = [], []
    for col in di:
        if col.name in column_names:
            if mark:
                newcol = col.replace(drop=True)
                newcols.append(newcol)
            else:
                to_drop.append(col.name)
        else:
            newcols.append(col)
    replace_dict = {'datainfo': di.replace(columns=newcols)}
    if to_drop:
        df = get_and_check_dataset(model).copy()
        replace_dict['dataset'] = df.drop(to_drop, axis=1)
    model = model.replace(**replace_dict)
    return model.update_source()




[docs]
def undrop_columns(model: Model, column_names: Union[list[str], str]):
    """Undrop columns of model

    Parameters
    ----------
    model : Model
        Pharmpy model
    column_names : list or str
        List of column names or one column name to undrop

    Returns
    -------
    Model
        Updated Pharmpy model

    Example
    -------
    >>> from pharmpy.modeling import *
    >>> model = load_example_model("pheno")
    >>> model = drop_columns(model, ['WGT', 'APGR'], mark=True)
    >>> model = undrop_columns(model, 'WGT')

    See also
    --------
    drop_dropped_columns : Drop all columns marked as drop
    drop_columns : Drop or mark columns as dropped
    """
    if isinstance(column_names, str):
        column_names = [column_names]
    di = model.datainfo
    newcols = []
    for col in di:
        if col.name in column_names:
            newcol = col.replace(drop=False)
            newcols.append(newcol)
        else:
            newcols.append(col)
    model = model.replace(datainfo=di.replace(columns=newcols))
    return model.update_source()



def _translate_nonmem_time_value(time):
    if ':' in time:
        components = time.split(':')
        if len(components) != 2:
            raise DatasetError(f'Bad TIME format: {time}')
        hours = float(components[0]) + float(components[1]) / 60
        return hours
    else:
        return float(time)


def _translate_time_column(df, timecol, idcol):
    if df[timecol].dtype != np.float64:
        df[timecol] = df[timecol].apply(_translate_nonmem_time_value)
        df[timecol] = df[timecol] - df.groupby(idcol)[timecol].transform('first')
    return df


def _translate_nonmem_time_and_date_value(ser, timecol, datecol):
    timeval = _translate_nonmem_time_value(ser[timecol])
    date = ser[datecol]
    a = re.split(r'[^0-9]', date)
    if date.startswith('-') or len(a) == 1:
        return timeval + float(date) * 24
    elif len(a) == 2:
        year = 2001  # Non leap year
        month = a[1]
        day = a[0]
    elif len(a) == 3:
        if datecol.endswith('E'):
            month = a[0]
            day = a[1]
            year = a[2]
        elif datecol.endswith('1'):
            day = a[0]
            month = a[1]
            year = a[2]
        elif datecol.endswith('3'):
            year = a[0]
            day = a[1]
            month = a[2]
        else:  # Let DAT2 be default if other name
            year = a[0]
            month = a[1]
            day = a[2]
        if len(year) < 3:
            year = int(year)
            if year > 50:
                year += 1900
            else:
                year += 2000
        else:
            year = int(year)
        month = int(month)
        day = int(day)
        hour = int(timeval)
        timeval = (timeval - hour) * 60
        minute = int(timeval)
        timeval = (timeval - minute) * 60
        second = int(timeval)
        timeval = (timeval - second) * 1000000
        microsecond = int(timeval)
        timeval = (timeval - microsecond) * 1000
        nanosecond = int(timeval)
        ts = pd.Timestamp(
            year=year,
            month=month,
            day=day,
            hour=hour,
            minute=minute,
            second=second,
            microsecond=microsecond,
            nanosecond=nanosecond,
        )
        return ts
    else:
        raise DatasetError(f'Bad DATE value: {date}')


def _translate_time_and_date_columns(df, timecol, datecol, idcol):
    df[timecol] = df.apply(
        _translate_nonmem_time_and_date_value, axis=1, timecol=timecol, datecol=datecol
    )
    timediff = df[timecol] - df.groupby(idcol)[timecol].transform('first')
    if df[timecol].dtype != np.float64:
        df[timecol] = timediff.dt.total_seconds() / 3600
    return df


def _find_time_and_date_columns(model):
    # Both time and date can be None. If date is None time must be not None
    time = None
    date = None
    di = model.datainfo
    for col in di:
        if col.datatype == 'nmtran-time' and not col.drop:
            if time is None:
                time = col
            else:
                raise ValueError(f"Multiple time columns found {time} and {col.name}")
        elif col.datatype == 'nmtran-date' and not col.drop:
            if date is None:
                date = col
            else:
                raise ValueError(f"Multiple date columns found {date} and {col.name}")
    if time is None and date is not None:
        raise ValueError(f"Found date column {date}, but no time column")
    return time, date



[docs]
def translate_nmtran_time(model: Model):
    """Translate NM-TRAN TIME and DATE column into one TIME column

    If dataset of model have special NM-TRAN TIME and DATE columns these
    will be translated into one single time column with time in hours.

    Warnings
    --------
    Use this function with caution. For example reset events are currently not taken into account.

    Parameters
    ----------
    model : Model
        Pharmpy model

    Returns
    -------
    Model
        Updated Pharmpy model
    """
    if model.dataset is None:
        return model
    timecol, datecol = _find_time_and_date_columns(model)
    df = model.dataset.copy()
    di = model.datainfo
    idname = di.id_column.name
    if datecol is None:
        if timecol is None:
            return model
        else:
            df = _translate_time_column(df, timecol.name, idname)
    else:
        assert timecol is not None
        df = _translate_time_and_date_columns(df, timecol.name, datecol.name, idname)
        model = drop_columns(model, datecol.name)
        timecol = timecol.replace(unit='h')
    timecol = timecol.replace(datatype='float64')
    di = di.set_column(timecol)
    model = model.replace(datainfo=di, dataset=df)
    return model.update_source()



def _loq_mask(
    model: Model,
    lloq: Optional[Union[float, str]] = None,
    uloq: Optional[Union[float, str]] = None,
    blq: Optional[str] = None,
    alq: Optional[str] = None,
):
    """Boolean series with False for lloq records and True for non-lloq
    Options as remove_loq_data
    """
    if blq and lloq:
        raise ValueError("Cannot specify blq and lloq at the same time")
    if alq and uloq:
        raise ValueError("Cannot specify alq and uloq at the same time")
    df = get_and_check_dataset(model)
    if lloq is not None or uloq is not None:
        dv = model.datainfo.dv_column.name
    mdv = get_mdv(model)
    which_keep = pd.Series(True, index=df.index)
    if isinstance(lloq, str):
        lloq = df[lloq]
    if isinstance(uloq, str):
        uloq = df[uloq]
    if lloq is not None:
        which_keep &= (df[dv] > lloq) | mdv
    elif blq is not None:
        which_keep &= (df[blq] == 0) | mdv
    if uloq is not None:
        which_keep &= (df[dv] < uloq) | mdv
    elif alq is not None:
        which_keep &= (df[alq] == 0) | mdv
    return which_keep



[docs]
def remove_loq_data(
    model: Model,
    lloq: Optional[Union[float, str]] = None,
    uloq: Optional[Union[float, str]] = None,
    blq: Optional[str] = None,
    alq: Optional[str] = None,
    keep: int = 0,
):
    """Remove loq data records from the dataset

    Does nothing if none of the limits are specified.

    Parameters
    ----------
    model : Model
        Pharmpy model
    lloq : float or str
        Value or column name for lower limit of quantification.
    uloq : float or str
        Value or column name for upper limit of quantification.
    blq : str
        Column name for below limit of quantification indicator.
    alq : str
        Column name for above limit of quantification indicator.
    keep : int
        Number of loq records to keep for each run of consecutive loq records.

    Returns
    -------
    Model
        Updated Pharmpy model

    Examples
    --------
    >>> from pharmpy.modeling import *
    >>> model = load_example_model("pheno")
    >>> model = remove_loq_data(model, lloq=10, uloq=40)
    >>> len(model.dataset)
    736

    See also
    --------
    set_lloq_data
    transform_blq

    """
    which_keep = _loq_mask(model, lloq=lloq, uloq=uloq, blq=blq, alq=alq)
    df = get_and_check_dataset(model)
    if keep > 0:
        idcol = model.datainfo.id_column.name
        keep_df = pd.DataFrame(
            {'ID': df[idcol], 'consec': (~which_keep).diff().ne(0).cumsum(), 'remove': ~which_keep}
        )
        obj = keep_df.groupby([idcol, 'consec']).cumsum().le(keep)['remove']
        which_keep = obj | which_keep
    model = model.replace(dataset=df[which_keep])
    return model.update_source()




[docs]
def set_lloq_data(
    model: Model,
    value: Union[str, float, Expr],
    lloq: Optional[Union[float, str]] = None,
    blq: Optional[str] = None,
):
    """Set a dv value for lloq data records

    Parameters
    ----------
    model : Model
        Pharmpy model
    value : float or Expr
        The new dv value
    lloq : float or str
        Value or column name for lower limit of quantification.
    blq : str
        Column name for below limit of quantification indicator.

    Returns
    -------
    Model
        Updated Pharmpy model

    Examples
    --------
    >>> from pharmpy.modeling import *
    >>> model = load_example_model("pheno")
    >>> model = set_lloq_data(model, 0, lloq=10)

    See also
    --------
    remove_loq_data
    transform_blq

    """
    which_keep = _loq_mask(model, lloq=lloq, blq=blq)
    df = model.dataset.copy()
    dv = model.datainfo.dv_column.name
    if isinstance(value, Expr) or isinstance(value, str):
        value = df.eval(str(value))
    df[dv] = df[dv].where(which_keep, value)
    model = model.replace(dataset=df)
    return model




[docs]
def set_reference_values(model: Model, refs: dict[str, Union[int, float]]):
    """Set reference values for selected columns

        All values for each selected column will be replaced. For dose columns
        only the values for dosing events will be replaced.

    Parameters
    ----------
    model : Model
        Pharmpy model
    refs : dict
        Pairs of column names and reference values

    Returns
    -------
    Model
        Updated Pharmpy model

    Examples
    --------
    >>> from pharmpy.modeling import *
    >>> model = load_example_model("pheno")
    >>> model = set_reference_values(model, {'WGT': 0.5, 'AMT': 4.0})
    >>> model.dataset
         ID   TIME  AMT  WGT  APGR    DV  FA1  FA2
    0     1    0.0  4.0  0.5   7.0   0.0  1.0  1.0
    1     1    2.0  0.0  0.5   7.0  17.3  0.0  0.0
    2     1   12.5  4.0  0.5   7.0   0.0  1.0  1.0
    3     1   24.5  4.0  0.5   7.0   0.0  1.0  1.0
    4     1   37.0  4.0  0.5   7.0   0.0  1.0  1.0
    ..   ..    ...  ...  ...   ...   ...  ...  ...
    739  59  108.3  4.0  0.5   6.0   0.0  1.0  1.0
    740  59  120.5  4.0  0.5   6.0   0.0  1.0  1.0
    741  59  132.3  4.0  0.5   6.0   0.0  1.0  1.0
    742  59  144.8  4.0  0.5   6.0   0.0  1.0  1.0
    743  59  146.8  0.0  0.5   6.0  40.2  0.0  0.0
    <BLANKLINE>
    [744 rows x 8 columns]

    """
    df = model.dataset
    di = model.datainfo
    newcols = dict()
    dtypes = dict()
    for colname, value in refs.items():
        if di[colname].type == 'dose':
            newdose = df[colname].mask(df[colname] > 0, value)
            newcols[colname] = newdose
        else:
            newcols[colname] = value
        datatype = ColumnInfo.convert_datatype_to_pd_dtype(di[colname].datatype)
        dtypes[colname] = datatype
    df = df.assign(**newcols).astype(dtypes)
    model = model.replace(dataset=df)
    return model




[docs]
def infer_datatypes(model: Model, columns: Optional[Collection[str]] = None) -> Model:
    """Infer and set datatypes for the dataset

        All columns or only the ones set in the columns option will be checked
        for conversion to a simpler datatype. Currently only int32 will be checked since
        this corresponds to the integer type in R.

    Parameters
    ----------
    model : Model
        Pharmpy model
    columns : list[str]
        The columns to attempt conversion or None for all

    Returns
    -------
    Model
        Updated Pharmpy model

    Examples
    --------
    >>> from pharmpy.modeling import *
    >>> model = load_example_model("pheno")
    >>> model = infer_datatypes(model, ['APGR'])
    >>> model.dataset
         ID   TIME   AMT  WGT  APGR    DV  FA1  FA2
    0     1    0.0  25.0  1.4     7   0.0  1.0  1.0
    1     1    2.0   0.0  1.4     7  17.3  0.0  0.0
    2     1   12.5   3.5  1.4     7   0.0  1.0  1.0
    3     1   24.5   3.5  1.4     7   0.0  1.0  1.0
    4     1   37.0   3.5  1.4     7   0.0  1.0  1.0
    ..   ..    ...   ...  ...   ...   ...  ...  ...
    739  59  108.3   3.0  1.1     6   0.0  1.0  1.0
    740  59  120.5   3.0  1.1     6   0.0  1.0  1.0
    741  59  132.3   3.0  1.1     6   0.0  1.0  1.0
    742  59  144.8   3.0  1.1     6   0.0  1.0  1.0
    743  59  146.8   0.0  1.1     6  40.2  0.0  0.0
    <BLANKLINE>
    [744 rows x 8 columns]
    """

    if columns is None:
        columns = model.dataset.columns
    df = model.dataset
    if df is None:
        return model

    di = model.datainfo
    changed = False

    for col in columns:
        if pd.api.types.is_float_dtype(df[col]):
            asint = df[col].astype('int32')
            if all(asint == df[col]):
                changed = True
                df = df.assign(**{col: asint})
                ci = di[col]
                ci = ci.replace(datatype='int32')
                di = di.set_column(ci)

    if changed:
        model = model.replace(dataset=df, datainfo=di)

    return model



class Checker:
    _all_checks = (
        ('A1', 'Body weight has unit'),
        ('A2', 'Body weight has mass unit'),
        ('A3', 'Body weight >0 and <700kg'),
        ('A4', 'Age has unit'),
        ('A5', 'Age has time unit'),
        ('A6', 'Age >=0 and <130 years'),
        ('A7', 'Lean body mass has unit'),
        ('A8', 'Lean body mass has mass unit'),
        ('A9', 'Lean body mass >0 and <700kg'),
        ('A10', 'Fat free mass has unit'),
        ('A11', 'Fat free mass has mass unit'),
        ('A12', 'Fat free mass >0 and <700kg'),
        ('D1', 'Time after dose has unit'),
        ('D2', 'Time after dose has time unit'),
        ('D3', 'Time after dose >=0'),
        ('D4', 'Plasma concentration has unit'),
        ('D5', 'Plasma concentration has mass/volume unit'),
        ('D6', 'Plasma concentration >= 0'),
        ('I1', 'Subject identifier is unitless'),
    )

    def __init__(self, datainfo, dataset, verbose=False):
        self.datainfo = datainfo
        self.dataset = dataset
        self.verbose = verbose
        self.check_results = {}
        self.violations = []

    def set_result(self, code, test=False, violation=None, skip=False, warn=False):
        if skip:
            result = "SKIP"
        elif test:
            result = "OK"
        else:
            if warn:
                result = "WARN"
            else:
                result = "FAIL"
        if code not in self.check_results or (
            code in self.check_results
            and (
                self.check_results[code] == 'SKIP'
                or self.check_results[code] == 'OK'
                and result in ('WARN', 'FAIL')
                or self.check_results[code] == 'WARN'
                and result == 'FAIL'
            )
        ):
            self.check_results[code] = result
        if result in ('WARN', 'FAIL'):
            self.violations.append((code, result, violation))

    def check_has_unit(self, code, col):
        has_unit = col.unit is not None
        self.set_result(code, test=has_unit, violation=col.name, warn=True)
        return has_unit

    def check_is_unitless(self, code, col):
        is_unitless = col.unit == Unit.unitless()
        self.set_result(code, test=is_unitless, violation=col.name, warn=True)

    def check_dimension(self, code, column, dim):
        if column.unit is None:
            self.set_result(code, skip=True)
            return False
        else:
            dim2 = sympy.physics.units.Dimension(
                sympy.physics.units.si.SI.get_dimensional_expr(column.unit._expr)
            )
            self.set_result(
                code,
                test=dim == dim2,
                violation=f"Unit {column.unit} of {column.name} is not a {dim} unit",
            )
            return dim == dim2

    def check_range(self, code, col, lower, upper, unit, lower_included=True, upper_included=True):
        name = col.name
        if lower == 0:
            scaled_lower = lower
        else:
            scaled_lower = float(
                sympy.physics.units.convert_to(lower * unit, col.unit._expr) / col.unit._expr
            )
        if upper == 0:
            scaled_upper = upper
        else:
            scaled_upper = float(
                sympy.physics.units.convert_to(upper * unit, col.unit._expr) / col.unit._expr
            )
        if lower_included:
            lower_viol = self.dataset[name] < scaled_lower
        else:
            lower_viol = self.dataset[name] <= scaled_lower
        if upper_included:
            upper_viol = self.dataset[name] > scaled_upper
        else:
            upper_viol = self.dataset[name] >= scaled_upper
        all_viol = lower_viol | upper_viol
        violations = all_viol[all_viol]
        if not violations.empty:
            for i in violations.index:
                self.set_result(
                    code,
                    test=False,
                    violation=f"{col.name} index={i} value={self.dataset[name].loc[i]}",
                )
        else:
            self.set_result(code, test=True)

    def get_dataframe(self):
        codes = []
        checks = []
        results = []
        violations = []

        for code, msg in Checker._all_checks:
            if code not in self.check_results:
                self.check_results[code] = "SKIP"

            if self.check_results[code] in ['OK', 'SKIP']:
                if self.verbose:
                    codes.append(code)
                    checks.append(msg)
                    results.append(self.check_results[code])
                    violations.append(None)
            else:
                for viol in self.violations:
                    if (
                        viol[0] == code
                        and viol[1] == "FAIL"
                        or (viol[1] == "WARN" and self.verbose)
                    ):
                        codes.append(code)
                        checks.append(msg)
                        results.append(viol[1])
                        violations.append(viol[2])
        df = pd.DataFrame(
            {'code': codes, 'check': checks, 'result': results, 'violation': violations}
        )
        return df

    def print(self):
        table = rich_table.Table(title="Dataset checks", box=rich_box.SQUARE)
        table.add_column("Code")
        table.add_column("Check")
        table.add_column("Result")
        table.add_column("Violation")

        for code, msg in Checker._all_checks:
            if code not in self.check_results:
                self.check_results[code] = "SKIP"

            if self.check_results[code] in ['OK', 'SKIP']:
                if self.verbose:
                    table.add_row(code, msg, f'[bold green]{self.check_results[code]}', "")
            else:
                for viol in self.violations:
                    if (
                        viol[0] == code
                        and viol[1] == "FAIL"
                        or (viol[1] == "WARN" and self.verbose)
                    ):
                        result = viol[1]
                        if result == "FAIL":
                            result = f"[bold red]{result}"
                        else:
                            result = f"[bold yellow]{result}"
                        table.add_row(code, msg, result, viol[2])

        if table.rows:  # Do not print an empty table
            console = rich_console.Console()
            console.print(table)



[docs]
def check_dataset(model: Model, dataframe: bool = False, verbose: bool = False):
    """Check dataset for consistency across a set of rules

    Parameters
    ----------
    model : Model
        Pharmpy model
    dataframe : bool
        True to return a DataFrame instead of printing to the console
    verbose : bool
        Print out all rules checked if True else print only failed rules

    Returns
    -------
    pd.DataFrame
        Only returns a DataFrame is dataframe=True
    """
    di = model.datainfo
    df = model.dataset
    checker = Checker(di, df, verbose=verbose)

    for col in di:
        if col.descriptor == "body weight":
            checker.check_has_unit("A1", col)
            samedim = checker.check_dimension("A2", col, sympy.physics.units.mass)
            if samedim:
                checker.check_range("A3", col, 0, 700, sympy.physics.units.kg, False, False)

        if col.descriptor == "age":
            checker.check_has_unit("A4", col)
            samedim = checker.check_dimension("A5", col, sympy.physics.units.time)
            if samedim:
                checker.check_range("A6", col, 0, 130, sympy.physics.units.year, True, False)

        if col.descriptor == "lean body mass":
            checker.check_has_unit("A7", col)
            samedim = checker.check_dimension("A8", col, sympy.physics.units.mass)
            if samedim:
                checker.check_range("A9", col, 0, 700, sympy.physics.units.kg, False, False)

        if col.descriptor == "fat free mass":
            checker.check_has_unit("A10", col)
            samedim = checker.check_dimension("A11", col, sympy.physics.units.mass)
            if samedim:
                checker.check_range("A12", col, 0, 700, sympy.physics.units.kg, False, False)

        if col.descriptor == "time after dose":
            checker.check_has_unit("D1", col)
            samedim = checker.check_dimension("D2", col, sympy.physics.units.time)
            if samedim:
                checker.check_range(
                    "D3", col, 0, float('inf'), sympy.physics.units.second, True, False
                )

        if col.descriptor == "plasma concentration":
            checker.check_has_unit("D4", col)
            samedim = checker.check_dimension(
                "D5", col, sympy.physics.units.mass / sympy.physics.units.length**3
            )
            if samedim:
                checker.check_range(
                    "D6",
                    col,
                    0,
                    float('inf'),
                    sympy.physics.units.kg / sympy.physics.units.L,
                    True,
                    False,
                )

        if col.descriptor == "subject identifier":
            checker.check_is_unitless("I1", col)

    if dataframe:
        return checker.get_dataframe()
    else:
        checker.print()




[docs]
def read_dataset_from_datainfo(
    datainfo: Union[DataInfo, Path, str], datatype: Optional[str] = None
):
    """Read a dataset given a datainfo object or path to a datainfo file

    Parameters
    ----------
    datainfo : DataInfo | Path | str
        A datainfo object or a path to a datainfo object
    datatype : str
        A string to specify dataset type

    Returns
    -------
    pd.DataFrame
        The dataset
    """
    if not isinstance(datainfo, DataInfo):
        datainfo = DataInfo.read_json(datainfo)

    if datainfo.path is None:
        raise ValueError('datainfo.path is None')
    from pharmpy.model.external.nonmem.dataset import read_nonmem_dataset
    from pharmpy.model.external.nonmem.parsing import filter_observations

    if datatype == 'nonmem':
        drop = [col.drop for col in datainfo]
        df = read_nonmem_dataset(
            datainfo.path,
            ignore_character='@',
            drop=drop,
            colnames=datainfo.names,
            dtype=datainfo.get_dtype_dict(),
        )
        # This assumes a PK model
        df = filter_observations(df, datainfo)
    else:
        df = pd.read_csv(
            datainfo.path,
            sep=datainfo.separator,
            dtype=datainfo.get_dtype_dict(),
            float_precision='round_trip',
        )
    return df



def create_default_datainfo(path_or_df):
    if not isinstance(path_or_df, pd.DataFrame):
        path = path_absolute(path_or_df)
        datainfo_path = path.with_suffix('.datainfo')

        if datainfo_path.is_file():
            di = DataInfo.read_json(datainfo_path)
            di = di.replace(path=path)
            return di
        else:
            with open(path) as file:
                first_line = file.readline()
                if ',' not in first_line:
                    colnames = list(pd.read_csv(path, nrows=0, sep=r'\s+'))
                    separator = r'\s+'
                else:
                    colnames = list(pd.read_csv(path, nrows=0))
                    separator = ','
            if len(colnames) > 0:
                colnames[0] = colnames[0].lstrip('#')

    else:
        colnames = path_or_df.columns
        separator = None
        path = None

    column_info = []
    for colname in colnames:
        colname = colname.replace('.', '_')  # pandas uses . to name mangle
        if colname == 'ID' or colname == 'L1':
            info = ColumnInfo.create(colname, type='id', scale='nominal', datatype='int32')
        elif colname == 'DV':
            info = ColumnInfo.create(colname, type='dv')
        elif colname == 'TIME':
            if not set(colnames).isdisjoint({'DATE', 'DAT1', 'DAT2', 'DAT3'}):
                datatype = 'nmtran-time'
            else:
                datatype = 'float64'
            info = ColumnInfo.create(colname, type='idv', scale='ratio', datatype=datatype)
        elif colname == 'EVID':
            info = ColumnInfo.create(colname, type='event', scale='nominal')
        elif colname == 'MDV':
            if 'EVID' in colnames:
                info = ColumnInfo.create(colname, type='mdv')
            else:
                info = ColumnInfo.create(colname, type='event', scale='nominal', datatype='int32')
        elif colname == 'AMT':
            info = ColumnInfo.create(colname, type='dose', scale='ratio')
        elif colname == 'BLQ':
            info = ColumnInfo.create(colname, type='blq', scale='nominal', datatype='int32')
        elif colname == 'LLOQ':
            info = ColumnInfo.create(colname, type='lloq', scale='ratio')
        elif colname == 'DVID':
            info = ColumnInfo.create(colname, type='dvid', scale='nominal', datatype='int32')
        elif colname == 'SS':
            info = ColumnInfo.create(colname, type='ss', scale='nominal', datatype='int32')
        elif colname == 'II':
            info = ColumnInfo.create(colname, type='ii', scale='ratio')
        else:
            info = ColumnInfo.create(colname)
        column_info.append(info)
    di = DataInfo.create(column_info, path=path, separator=separator)
    return di



[docs]
def deidentify_data(
    df: pd.DataFrame, id_column: str = 'ID', date_columns: Optional[list[str]] = None
):
    """Deidentify a dataset

    Two operations are performed on the dataset:

    1. All ID numbers are randomized from the range 1 to n
    2. All columns containing dates will have the year changed

    The year change is done by letting the earliest year in the dataset
    be used as a reference and by maintaining leap years. The reference year
    will either be 1901, 1902, 1903 or 1904 depending on its distance to the closest
    preceeding leap year.

    Parameters
    ----------
    df : pd.DataFrame
        A dataset
    id_column : str
        Name of the id column
    date_columns : list
        Names of all date columns

    Returns
    -------
    pd.DataFrame
        Deidentified dataset
    """
    df = df.copy()
    df[id_column] = pd.to_numeric(df[id_column])
    resampler = resample_data(df, id_column)
    df, _ = next(resampler)

    if date_columns is None:
        return df
    for datecol in date_columns:
        if pd.api.types.is_datetime64_any_dtype(df[datecol]):
            pass
        elif df[datecol].dtype == 'object':
            # assume string
            df[datecol] = pd.to_datetime(df[datecol])
        else:
            raise ValueError(f"Column {datecol} does not seem to contain a date")
    earliest_date = df[date_columns].min().min()

    # Handle leap year modulo
    earliest_year_modulo = earliest_date.year % 4
    reference_offset = 4 if earliest_year_modulo == 0 else earliest_year_modulo
    reference_year = 1900 + reference_offset
    delta = earliest_date.year - reference_year

    def convert(x):
        new = x.replace(year=x.year - delta)
        return new

    for datecol in date_columns:
        df[datecol] = df[datecol].transform(convert)

    return df




[docs]
def unload_dataset(model: Model):
    """Unload the dataset from a model

    Parameters
    ----------
    model : Model
        Pharmpy model

    Returns
    -------
    Model
        Updated Pharmpy model

    Example
    -------
    >>> from pharmpy.modeling import load_example_model, unload_dataset
    >>> model = load_example_model("pheno")
    >>> model = unload_dataset(model)
    >>> model.dataset is None
    True

    """
    model = model.replace(dataset=None)
    return model




[docs]
def load_dataset(model: Model):
    """Load the dataset given datainfo

    Parameters
    ----------
    model : Model
        Pharmpy model

    Returns
    -------
    Model
        Updated Pharmpy model

    Example
    -------
    >>> from pharmpy.modeling import load_example_model, load_dataset, unload_dataset
    >>> model = load_example_model("pheno")
    >>> model = unload_dataset(model)
    >>> model.dataset is None
    True
    >>> model = load_dataset(model)
    >>> model.dataset
         ID   TIME   AMT  WGT  APGR    DV  FA1  FA2
    0     1    0.0  25.0  1.4   7.0   0.0  1.0  1.0
    1     1    2.0   0.0  1.4   7.0  17.3  0.0  0.0
    2     1   12.5   3.5  1.4   7.0   0.0  1.0  1.0
    3     1   24.5   3.5  1.4   7.0   0.0  1.0  1.0
    4     1   37.0   3.5  1.4   7.0   0.0  1.0  1.0
    ..   ..    ...   ...  ...   ...   ...  ...  ...
    739  59  108.3   3.0  1.1   6.0   0.0  1.0  1.0
    740  59  120.5   3.0  1.1   6.0   0.0  1.0  1.0
    741  59  132.3   3.0  1.1   6.0   0.0  1.0  1.0
    742  59  144.8   3.0  1.1   6.0   0.0  1.0  1.0
    743  59  146.8   0.0  1.1   6.0  40.2  0.0  0.0
    <BLANKLINE>
    [744 rows x 8 columns]

    """
    df = read_dataset_from_datainfo(model.datainfo)
    model = model.replace(dataset=df)
    return model




[docs]
def set_dataset(
    model: Model, path_or_df: Union[str, Path, pd.DataFrame], datatype: Optional[str] = None
):
    """Load the dataset given datainfo

    Parameters
    ----------
    model : Model
        Pharmpy model
    path_or_df : str, Path, or pd.DataFrame
        Dataset path or dataframe
    datatype : str
        Type of dataset (optional)

    Returns
    -------
    Model
        Updated Pharmpy model

    Example
    -------
    >>> from pharmpy.modeling import load_example_model, load_dataset, unload_dataset, set_dataset
    >>> model = load_example_model("pheno")
    >>> model = unload_dataset(model)
    >>> dataset_path = model.datainfo.path
    >>> model.dataset is None
    True
    >>> model = set_dataset(model, dataset_path, datatype='nonmem')
    >>> model.dataset
         ID   TIME   AMT  WGT  APGR    DV  FA1  FA2
    0     1    0.0  25.0  1.4   7.0   0.0  1.0  1.0
    1     1    2.0   0.0  1.4   7.0  17.3  0.0  0.0
    2     1   12.5   3.5  1.4   7.0   0.0  1.0  1.0
    3     1   24.5   3.5  1.4   7.0   0.0  1.0  1.0
    4     1   37.0   3.5  1.4   7.0   0.0  1.0  1.0
    ..   ..    ...   ...  ...   ...   ...  ...  ...
    739  59  108.3   3.0  1.1   6.0   0.0  1.0  1.0
    740  59  120.5   3.0  1.1   6.0   0.0  1.0  1.0
    741  59  132.3   3.0  1.1   6.0   0.0  1.0  1.0
    742  59  144.8   3.0  1.1   6.0   0.0  1.0  1.0
    743  59  146.8   0.0  1.1   6.0  40.2  0.0  0.0
    <BLANKLINE>
    [744 rows x 8 columns]

    """
    if isinstance(path_or_df, pd.DataFrame):
        df = path_or_df
        if datatype == 'nonmem':
            di = create_default_datainfo(path_or_df)
        else:
            di = DataInfo.create(columns=list(df.columns.values), path=None)
    else:
        path = normalize_user_given_path(path_or_df)
        if datatype == 'nonmem':
            di = create_default_datainfo(path)
        else:
            di = DataInfo.create(path=path)
        df = read_dataset_from_datainfo(di, datatype=datatype)
        di = update_datainfo(di, df).replace(path=path)

    if len(df.columns) == 1:
        warnings.warn('Could only find one column, should this be another datatype?')

    model = model.replace(dataset=df, datainfo=di)
    return model.update_source()




[docs]
def bin_observations(
    model: Model, method: Literal["equal_width", "equal_number"], nbins: int
) -> pd.Series:
    """Bin all observations on the independent variable

    Available binning methods:

    +---------------+-------------------------------------------------+
    | Method        | Description                                     |
    +===============+=================================================+
    | equal_width   | Bins with equal width based on the idv          |
    +---------------+-------------------------------------------------+
    | equal_number  | Bins containing an equal number of observations |
    +---------------+-------------------------------------------------+

    Parameters
    ----------
    model : Model
        Pharmpy model
    method : str
        Name of the binning method to use
    nbins : int
        The number of bins wanted

    Returns
    -------
    pd.Series
        A series of bin ids indexed on the original record index of the dataset
    list
        A list of bin edges

    Example
    -------
    >>> from pharmpy.modeling import load_example_model, bin_observations
    >>> model = load_example_model("pheno")
    >>> bins, boundaries = bin_observations(model, method="equal_width", nbins=10)
    >>> bins
    1      0
    11     2
    13     0
    19     1
    26     3
           ..
    719    1
    727    3
    729    0
    736    1
    743    3
    Name: TIME, Length: 155, dtype: int64
    >>> boundaries
    array([  0.  ,  39.88,  78.76, 117.64, 156.52, 195.4 , 234.28, 273.16,
           312.04, 350.92, 389.8 ])

    """

    df = get_and_check_dataset(model)
    observations = get_observations(model, keep_index=True)
    obs = df.loc[observations.index]
    idv = model.datainfo.idv_column.name
    sorted_idvs = obs[idv].sort_values()
    method_lower = method.lower()
    if method_lower == "equal_width":
        bincol, boundaries = pd.cut(sorted_idvs, nbins, labels=False, retbins=True)
        boundaries[0] = 0
    elif method_lower == "equal_number":
        bin_edges = _get_bin_edges_psn(sorted_idvs, nbins)
        bincol, boundaries = pd.cut(
            sorted_idvs, bin_edges, labels=False, retbins=True, include_lowest=True
        )
    else:
        raise ValueError(f"Unknown binning method {method}")
    sorted_bincol = bincol.sort_index()
    return sorted_bincol, boundaries



def _get_bin_edges_psn(data, n_bins):
    """Similar to function "get_bin_ceilings_from_count" from PsN

    Divide a list of data points into bins of equal count.
    """
    # Create a dictionary with unique values and their indices
    unique_values, value_indices = np.unique(data, return_inverse=True)
    value_dict = {i: val for i, val in enumerate(unique_values)}

    # Count occurrences of each unique value
    obs_count = pd.Series(value_indices).value_counts().sort_index().tolist()

    # Calculate the number of unique values
    n_values = len(value_dict)

    # Calculate the ideal count for each bin
    count_per_bin = len(data) / n_bins
    ideal_count = [count_per_bin] * n_bins

    bin_ceilings = [0]
    global_error = 0
    bin_index = 0
    local_error = -ideal_count[bin_index]

    for value_index, obs in enumerate(obs_count):
        if bin_index == len(ideal_count) - 1:
            bin_ceilings.append(unique_values[n_values - 1])
            break
        elif local_error == -ideal_count[bin_index]:
            local_error += obs
        elif obs == 0:
            continue
        elif abs(global_error + local_error) > abs(global_error + local_error + obs) and (
            n_values - value_index - 1
        ) > (len(ideal_count) - bin_index - 1):
            local_error += obs
        else:
            bin_ceilings.append(unique_values[value_index - 1])
            global_error += local_error
            bin_index += 1
            local_error = -ideal_count[bin_index] + obs

    # FIXME: This is a trick to not have 0 and 0
    if len(bin_ceilings) > 1 and bin_ceilings[0] == bin_ceilings[1]:
        bin_ceilings[1] += 0.00000001

    return bin_ceilings



[docs]
def binarize_dataset(
    model: Model, columns: Optional[list[str]], keep: bool = False, all_levels: bool = False
):
    """Binarize dataset

    Will create one column per category if specified, otherwise for all columns
    except for the last one. Will also update datainfo so that new columns have
    type covariate and is categorical.


    Parameters
    ----------
    model : Model
        Pharmpy model
    columns : list[str]
        The columns to binarize or None for all marked as categorical
    keep : bool
        Keep the original column in dataset (default is False)
    all_levels : bool
        Create one column per level, otherwise skip last value (default is False)

    Returns
    -------
    Model
        Updated Pharmpy model

    Examples
    --------
    >>> from pharmpy.modeling import *
    >>> model = load_example_model("pheno")
    >>> model = binarize_dataset(model, ['APGR'])
    >>> model.dataset
         ID   TIME   AMT  WGT    DV  FA1  FA2  APGR_1  APGR_2  APGR_3  APGR_4  APGR_5  APGR_6  APGR_7  APGR_8  APGR_9
    0     1    0.0  25.0  1.4   0.0  1.0  1.0       0       0       0       0       0       0       1       0       0
    1     1    2.0   0.0  1.4  17.3  0.0  0.0       0       0       0       0       0       0       1       0       0
    2     1   12.5   3.5  1.4   0.0  1.0  1.0       0       0       0       0       0       0       1       0       0
    3     1   24.5   3.5  1.4   0.0  1.0  1.0       0       0       0       0       0       0       1       0       0
    4     1   37.0   3.5  1.4   0.0  1.0  1.0       0       0       0       0       0       0       1       0       0
    ..   ..    ...   ...  ...   ...  ...  ...     ...     ...     ...     ...     ...     ...     ...     ...     ...
    739  59  108.3   3.0  1.1   0.0  1.0  1.0       0       0       0       0       0       1       0       0       0
    740  59  120.5   3.0  1.1   0.0  1.0  1.0       0       0       0       0       0       1       0       0       0
    741  59  132.3   3.0  1.1   0.0  1.0  1.0       0       0       0       0       0       1       0       0       0
    742  59  144.8   3.0  1.1   0.0  1.0  1.0       0       0       0       0       0       1       0       0       0
    743  59  146.8   0.0  1.1  40.2  0.0  0.0       0       0       0       0       0       1       0       0       0
    <BLANKLINE>
    [744 rows x 16 columns]
    """
    if model.dataset is None:
        return model

    model = infer_datatypes(model, columns)
    df = model.dataset
    di = model.datainfo
    if columns is None:
        try:
            covariates = di.typeix['covariate']
        except IndexError:
            raise ValueError(f'No covariates for `columns` found: {columns}')
        columns = [cov.name for cov in covariates if cov.is_categorical()]
        if not columns:
            return model

    changed = False

    for col in columns:
        levels = sorted(set(df[col].values))
        if not all_levels:
            levels = levels[0:-1]

        level_map = dict()
        for level in levels:
            if '.' in str(level):
                level = str(level).replace('.', '_')
            col_name = f'{col}_{level}'
            col_values = np.where(df[col] == level, 1, 0)
            level_map[col_name] = col_values

        if len(level_map) == 0:
            continue

        df = df.assign(**level_map)
        if not keep:
            df = df.drop(columns=[col])

        di = update_datainfo(di, df)
        for col_name in level_map.keys():
            ci = di[col_name]
            ci_new = ci.replace(
                type='covariate', continuous=False, scale='nominal', categories=(0, 1)
            )
            di = di.set_column(ci_new)

        changed = True

    if changed:
        model = model.replace(dataset=df, datainfo=di)

    return model




[docs]
def get_column_name(model: Model, type: str) -> Optional[str]:
    """Retrieve the column with a certain type

    If multiple columns have the same type an exception will be raised.

    Parameters
    ----------
    model : Model
        Pharmpy model
    type : str
        Column type. See :py:attr:pharmpy.model.datainfo.ColumnInfo.type

    Returns
    -------
    str or None
        Name of the column. None if no column found

    Example
    -------
    >>> from pharmpy.modeling import load_example_model, get_column_name
    >>> model = load_example_model("pheno")
    >>> get_column_name(model, "dose")
    'AMT'
    """
    try:
        columns = model.datainfo.typeix[type]
    except IndexError:
        name = None
    else:
        if len(columns) > 1:
            raise IndexError(f"More than one column of type {type}")
        name = columns[0].name
    return name