from __future__ import annotations
import re
import warnings
from pathlib import Path
from typing import Literal, Optional, Union
from pharmpy.basic import Expr, Unit
from pharmpy.deps import numpy as np
from pharmpy.deps import pandas as pd
from pharmpy.deps import sympy
from pharmpy.deps.rich import box as rich_box
from pharmpy.deps.rich import console as rich_console
from pharmpy.deps.rich import table as rich_table
from pharmpy.internals.fs.path import normalize_user_given_path, path_absolute
from pharmpy.model import (
ColumnInfo,
CompartmentalSystem,
DataInfo,
DatasetError,
Model,
get_and_check_dataset,
)
from pharmpy.model.model import update_datainfo
from .iterators import resample_data
def get_ids(model: Model) -> list[int]:
"""Retrieve a list of all subject ids of the dataset
Parameters
----------
model : Model
Pharmpy model
Returns
-------
list
All subject ids
Example
-------
>>> from pharmpy.modeling import load_example_model, get_ids
>>> model = load_example_model("pheno")
>>> get_ids(model) # doctest: +ELLIPSIS
[1, 2, 3, ..., 57, 58, 59]
"""
df = get_and_check_dataset(model)
idcol = model.datainfo.id_column.name
ids = list(int(x) for x in df[idcol].unique())
return ids
def get_number_of_individuals(model: Model):
"""Retrieve the number of individuals in the model dataset
Parameters
----------
model : Model
Pharmpy model
Returns
-------
int
Number of individuals in the model dataset
Examples
--------
>>> from pharmpy.modeling import get_number_of_individuals, load_example_model
>>> model = load_example_model("pheno")
>>> get_number_of_individuals(model)
59
Notes
-----
For NONMEM models this is the number of individuals of the active dataset, i.e. after filtering
of IGNORE and ACCEPT and removal of individuals with no observations.
See also
--------
get_number_of_observations : Get the number of observations in a dataset
get_number_of_observations_per_individual : Get the number of observations per individual in a
dataset
"""
return len(get_ids(model))
def get_number_of_observations(model: Model):
"""Retrieve the total number of observations in the model dataset
Parameters
----------
model : Model
Pharmpy model
Returns
-------
int
Number of observations in the model dataset
Examples
--------
>>> from pharmpy.modeling import get_number_of_observations, load_example_model
>>> model = load_example_model("pheno")
>>> get_number_of_observations(model)
155
Notes
-----
For NONMEM models this is the number of observations of the active dataset, i.e. after filtering
of IGNORE and ACCEPT and removal of individuals with no observations.
See also
--------
get_number_of_individuals : Get the number of individuals in a dataset
get_number_of_observations_per_individual : Get the number of observations per individual in a
dataset
"""
return len(get_observations(model))
def get_number_of_observations_per_individual(model: Model):
"""Number of observations for each individual
Parameters
----------
model : Model
Pharmpy model
Returns
-------
pd.Series
        Number of observations for each individual
Examples
--------
>>> from pharmpy.modeling import get_number_of_observations_per_individual, load_example_model
>>> model = load_example_model("pheno")
>>> get_number_of_observations_per_individual(model)
ID
1 2
2 3
3 3
4 3
5 3
6 3
7 3
8 3
9 4
10 3
11 1
12 3
13 2
14 4
15 2
16 3
17 3
18 4
19 3
20 3
21 3
22 2
23 3
24 3
25 6
26 2
27 2
28 1
29 1
30 2
31 1
32 3
33 2
34 2
35 2
36 3
37 2
38 4
39 3
40 2
41 3
42 2
43 1
44 3
45 3
46 1
47 1
48 5
49 3
50 4
51 3
52 3
53 2
54 4
55 1
56 1
57 2
58 3
59 3
Name: observation_count, dtype: int64
Notes
-----
For NONMEM models this is the individuals and number of observations of the active dataset, i.e.
after filtering of IGNORE and ACCEPT and removal of individuals with no observations.
See also
--------
get_number_of_individuals : Get the number of individuals in a dataset
    get_number_of_observations : Get the number of observations in a dataset
"""
ser = get_observations(model).groupby(model.datainfo.id_column.name).count()
ser.name = "observation_count"
return ser
def get_observations(model: Model, keep_index: bool = False) -> pd.Series:
"""Get observations from dataset
Parameters
----------
model : Model
Pharmpy model
keep_index : bool
Set to True if the original index should be kept.
Otherwise a new index using ID and idv will be created.
Returns
-------
pd.Series
Observations indexed over ID and TIME
Examples
--------
>>> from pharmpy.modeling import get_observations, load_example_model
>>> model = load_example_model("pheno")
>>> get_observations(model)
ID TIME
1 2.0 17.3
112.5 31.0
2 2.0 9.7
63.5 24.6
135.5 33.0
...
58 47.5 27.9
131.8 31.0
59 1.8 22.6
73.8 34.3
146.8 40.2
Name: DV, Length: 155, dtype: float64
See also
--------
get_number_of_observations : get the number of observations
get_number_of_observations_per_individual : get the number of observations per individual
"""
try:
label = model.datainfo.typeix['mdv'][0].name
except IndexError:
try:
label = model.datainfo.typeix['event'][0].name
except IndexError:
try:
label = model.datainfo.typeix['dose'][0].name
except IndexError:
label = None # All data records are observations
idcol = model.datainfo.id_column.name
idvcol = model.datainfo.idv_column.name
dvcol = model.datainfo.dv_column.name
df = get_and_check_dataset(model)
if label:
df = df.query(f'{label} == 0')
if df.empty:
df = df.astype({label: 'float'})
df = df.query(f'{label} == 0')
else:
df = df.copy()
if not keep_index:
df = df[[idcol, idvcol, dvcol]]
try:
# FIXME: This shouldn't be needed
df = df.astype({idvcol: np.float64})
except ValueError:
# TIME could not be converted to float (e.g. 10:15)
pass
df.set_index([idcol, idvcol], inplace=True)
df = df.squeeze()
else:
df = df[dvcol]
return df
def get_baselines(model: Model):
"""Baselines for each subject.
Baseline is taken to be the first row even if that has a missing value.
Parameters
----------
model : Model
Pharmpy model
Returns
-------
pd.DataFrame
Dataset with the baselines
Examples
--------
>>> from pharmpy.modeling import load_example_model, get_baselines
>>> model = load_example_model("pheno")
>>> get_baselines(model)
TIME AMT WGT APGR DV FA1 FA2
ID
1 0.0 25.0 1.4 7.0 0.0 1.0 1.0
2 0.0 15.0 1.5 9.0 0.0 1.0 1.0
3 0.0 30.0 1.5 6.0 0.0 1.0 1.0
4 0.0 18.6 0.9 6.0 0.0 1.0 1.0
5 0.0 27.0 1.4 7.0 0.0 1.0 1.0
6 0.0 24.0 1.2 5.0 0.0 1.0 1.0
7 0.0 19.0 1.0 5.0 0.0 1.0 1.0
8 0.0 24.0 1.2 7.0 0.0 1.0 1.0
9 0.0 27.0 1.4 8.0 0.0 1.0 1.0
10 0.0 27.0 1.4 7.0 0.0 1.0 1.0
11 0.0 24.0 1.2 7.0 0.0 1.0 1.0
12 0.0 26.0 1.3 6.0 0.0 1.0 1.0
13 0.0 11.0 1.1 6.0 0.0 1.0 1.0
14 0.0 22.0 1.1 7.0 0.0 1.0 1.0
15 0.0 26.0 1.3 7.0 0.0 1.0 1.0
16 0.0 12.0 1.2 9.0 0.0 1.0 1.0
17 0.0 22.0 1.1 5.0 0.0 1.0 1.0
18 0.0 20.0 1.0 5.0 0.0 1.0 1.0
19 0.0 10.0 1.0 1.0 0.0 1.0 1.0
20 0.0 24.0 1.2 6.0 0.0 1.0 1.0
21 0.0 17.5 1.8 7.0 0.0 1.0 1.0
22 0.0 15.0 1.5 8.0 0.0 1.0 1.0
23 0.0 60.0 3.1 3.0 0.0 1.0 1.0
24 0.0 63.0 3.2 2.0 0.0 1.0 1.0
25 0.0 15.0 0.7 1.0 0.0 1.0 1.0
26 0.0 70.0 3.5 9.0 0.0 1.0 1.0
27 0.0 35.0 1.9 5.0 0.0 1.0 1.0
28 0.0 60.0 3.2 9.0 0.0 1.0 1.0
29 0.0 20.0 1.0 7.0 0.0 1.0 1.0
30 0.0 18.0 1.8 8.0 0.0 1.0 1.0
31 0.0 30.0 1.4 8.0 0.0 1.0 1.0
32 0.0 70.0 3.6 9.0 0.0 1.0 1.0
33 0.0 17.0 1.7 8.0 0.0 1.0 1.0
34 0.0 34.0 1.7 4.0 0.0 1.0 1.0
35 0.0 25.0 2.5 5.0 0.0 1.0 1.0
36 0.0 30.0 1.5 5.0 0.0 1.0 1.0
37 0.0 24.0 1.2 9.0 0.0 1.0 1.0
38 0.0 26.0 1.3 8.0 0.0 1.0 1.0
39 0.0 56.0 1.9 10.0 0.0 1.0 1.0
40 0.0 19.0 1.1 3.0 0.0 1.0 1.0
41 0.0 34.0 1.7 7.0 0.0 1.0 1.0
42 0.0 28.0 2.8 9.0 0.0 1.0 1.0
43 0.0 18.0 0.9 1.0 0.0 1.0 1.0
44 0.0 14.0 1.4 7.0 0.0 1.0 1.0
45 0.0 16.0 0.8 2.0 0.0 1.0 1.0
46 0.0 11.0 1.1 8.0 0.0 1.0 1.0
47 0.0 40.0 2.6 9.0 0.0 1.0 1.0
48 0.0 14.0 0.7 8.0 0.0 1.0 1.0
49 0.0 26.0 1.3 8.0 0.0 1.0 1.0
50 0.0 20.0 1.1 6.0 0.0 1.0 1.0
51 0.0 18.0 0.9 9.0 0.0 1.0 1.0
52 0.0 9.5 0.9 7.0 0.0 1.0 1.0
53 0.0 17.0 1.7 8.0 0.0 1.0 1.0
54 0.0 18.0 1.8 8.0 0.0 1.0 1.0
55 0.0 25.0 1.1 4.0 0.0 1.0 1.0
56 0.0 12.0 0.6 4.0 0.0 1.0 1.0
57 0.0 20.0 2.1 6.0 0.0 1.0 1.0
58 0.0 14.0 1.4 8.0 0.0 1.0 1.0
59 0.0 22.8 1.1 6.0 0.0 1.0 1.0
"""
idlab = model.datainfo.id_column.name
baselines = model.dataset.groupby(idlab).nth(0).set_index(idlab)
return baselines
def set_covariates(model: Model, covariates: list[str]):
"""Set columns in the dataset to be covariates in the datainfo
Parameters
----------
model : Model
Pharmpy model
covariates : list
List of column names
Returns
-------
Model
Pharmpy model object
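    Example
    -------
    A minimal sketch using the pheno example model; the expected type shown
    below is illustrative and not verified here:
    >>> from pharmpy.modeling import load_example_model, set_covariates
    >>> model = load_example_model("pheno")
    >>> model = set_covariates(model, ["WGT", "APGR"])
    >>> model.datainfo["WGT"].type  # doctest: +SKIP
    'covariate'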
"""
di = model.datainfo
newcols = []
for col in di:
if col.name in covariates:
newcol = col.replace(type='covariate')
newcols.append(newcol)
else:
newcols.append(col)
model = model.replace(datainfo=di.replace(columns=newcols))
return model.update_source()
def set_dvid(model: Model, name: str):
"""Set a column to act as DVID. Replace DVID if one is already set.
Parameters
----------
model : Model
Pharmpy model
name : str
Name of DVID column
Returns
-------
Model
Pharmpy model object
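    Example
    -------
    An illustrative sketch assuming a dataset with an integer column named
    "DVID" (hypothetical column name; not verified):
    >>> model = set_dvid(model, "DVID")  # doctest: +SKIP
    >>> model.datainfo["DVID"].type  # doctest: +SKIP
    'dvid'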
"""
di = model.datainfo
col = di[name]
if col.type == 'dvid':
return model
try:
curdvid = di.typeix['dvid'][0]
except IndexError:
pass
else:
curdvid = curdvid.replace(type='unknown')
di = di.set_column(curdvid)
col = col.replace(
type='dvid',
unit=1,
scale='nominal',
continuous=False,
drop=False,
descriptor='observation identifier',
)
df = model.dataset
if not col.is_integer():
ser = df[name]
converted = pd.to_numeric(ser, downcast='integer')
if not pd.api.types.is_integer_dtype(converted):
raise ValueError(
f"Could not use column {name} as DVID because it contains non-integral values"
)
df = df.assign(**{name: converted})
col = col.replace(datatype=ColumnInfo.convert_pd_dtype_to_datatype(converted.dtype))
new_dataset = True
else:
new_dataset = False
col = col.replace(categories=sorted(df[name].unique()))
di = di.set_column(col)
if new_dataset:
model = model.replace(datainfo=di, dataset=df)
else:
model = model.replace(datainfo=di)
return model.update_source()
def get_covariate_baselines(model: Model):
"""Return a dataframe with baselines of all covariates for each id.
Baseline is taken to be the first row even if that has a missing value.
Parameters
----------
model : Model
Pharmpy model
Returns
-------
pd.DataFrame
covariate baselines
See also
--------
get_baselines : baselines for all data columns
Example
-------
>>> from pharmpy.modeling import load_example_model, get_covariate_baselines, set_covariates
>>> model = load_example_model("pheno")
>>> model = set_covariates(model, ["WGT", "APGR"])
>>> get_covariate_baselines(model)
WGT APGR
ID
1 1.4 7.0
2 1.5 9.0
3 1.5 6.0
4 0.9 6.0
5 1.4 7.0
6 1.2 5.0
7 1.0 5.0
8 1.2 7.0
9 1.4 8.0
10 1.4 7.0
11 1.2 7.0
12 1.3 6.0
13 1.1 6.0
14 1.1 7.0
15 1.3 7.0
16 1.2 9.0
17 1.1 5.0
18 1.0 5.0
19 1.0 1.0
20 1.2 6.0
21 1.8 7.0
22 1.5 8.0
23 3.1 3.0
24 3.2 2.0
25 0.7 1.0
26 3.5 9.0
27 1.9 5.0
28 3.2 9.0
29 1.0 7.0
30 1.8 8.0
31 1.4 8.0
32 3.6 9.0
33 1.7 8.0
34 1.7 4.0
35 2.5 5.0
36 1.5 5.0
37 1.2 9.0
38 1.3 8.0
39 1.9 10.0
40 1.1 3.0
41 1.7 7.0
42 2.8 9.0
43 0.9 1.0
44 1.4 7.0
45 0.8 2.0
46 1.1 8.0
47 2.6 9.0
48 0.7 8.0
49 1.3 8.0
50 1.1 6.0
51 0.9 9.0
52 0.9 7.0
53 1.7 8.0
54 1.8 8.0
55 1.1 4.0
56 0.6 4.0
57 2.1 6.0
58 1.4 8.0
59 1.1 6.0
"""
covariates = model.datainfo.typeix['covariate'].names
idlab = model.datainfo.id_column.name
df = model.dataset[covariates + [idlab]]
df = df.set_index(idlab)
return df.groupby(idlab).nth(0)
def list_time_varying_covariates(model: Model):
"""Return a list of names of all time varying covariates
Parameters
----------
model : Model
Pharmpy model
Returns
-------
list
Names of all time varying covariates
See also
--------
get_covariate_baselines : get baselines for all covariates
Example
-------
>>> from pharmpy.modeling import load_example_model, list_time_varying_covariates
>>> model = load_example_model("pheno")
>>> list_time_varying_covariates(model)
[]
"""
cov_labels = model.datainfo.typeix['covariate'].names
if len(cov_labels) == 0:
return []
else:
time_var = (
model.dataset.groupby(by=model.datainfo.id_column.name)[cov_labels]
.nunique()
.gt(1)
.any()
)
return list(time_var.index[time_var])
def get_doses(model: Model):
"""Get a series of all doses
Indexed with ID and TIME
Parameters
----------
model : Model
Pharmpy model
Returns
-------
pd.Series
doses
Example
-------
>>> from pharmpy.modeling import load_example_model, get_doses
>>> model = load_example_model("pheno")
>>> get_doses(model)
ID TIME
1 0.0 25.0
12.5 3.5
24.5 3.5
37.0 3.5
48.0 3.5
...
59 96.0 3.0
108.3 3.0
120.5 3.0
132.3 3.0
144.8 3.0
Name: AMT, Length: 589, dtype: float64
"""
try:
label = model.datainfo.typeix['dose'][0].name
except IndexError:
raise DatasetError('Could not identify dosing rows in dataset')
idcol = model.datainfo.id_column.name
idvcol = model.datainfo.idv_column.name
df = model.dataset.query(f'{label} != 0')
df = df[[idcol, idvcol, label]]
try:
# FIXME: This shouldn't be needed
df = df.astype({idvcol: np.float64})
except ValueError:
# TIME could not be converted to float (e.g. 10:15)
pass
df.set_index([idcol, idvcol], inplace=True)
return df.squeeze()
def expand_additional_doses(model: Model, flag: bool = False):
"""Expand additional doses into separate dose records
Parameters
----------
model : Model
Pharmpy model object
flag : bool
True to add a boolean EXPANDED column to mark added records. In this case all
columns in the original dataset will be kept. Care needs to be taken to handle
the new dataset.
Returns
-------
Model
Pharmpy model object
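    Example
    -------
    An illustrative sketch assuming a model whose dataset has ADDL and II
    columns (not verified):
    >>> model = expand_additional_doses(model, flag=True)  # doctest: +SKIP
    >>> model.dataset["EXPANDED"].any()  # doctest: +SKIP
    True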
"""
try:
addl = model.datainfo.typeix['additional'][0].name
ii = model.datainfo.typeix['ii'][0].name
except IndexError:
return model
idv = model.datainfo.idv_column.name
idcol = model.datainfo.id_column.name
df = get_and_check_dataset(model).copy()
try:
event = model.datainfo.typeix['event'][0].name
except IndexError:
df['_RESETGROUP'] = 1.0
else:
df['_FLAG'] = df[event] >= 3
        df['_RESETGROUP'] = df.groupby(idcol)['_FLAG'].cumsum()
df.drop('_FLAG', axis=1, inplace=True)
def fn(a):
if a[addl] == 0:
times = [a[idv]]
expanded = [False]
else:
length = int(a[addl])
times = [a[ii] * x + a[idv] for x in range(length + 1)]
expanded = [False] + [True] * length
a['_TIMES'] = times
a['_EXPANDED'] = expanded
return a
df = df.apply(fn, axis=1)
df = df.apply(lambda x: x.explode() if x.name in ['_TIMES', '_EXPANDED'] else x)
df = df.astype({'_EXPANDED': np.bool_})
df = df.groupby([idcol, '_RESETGROUP'], group_keys=False)[df.columns].apply(
lambda x: x.sort_values(by='_TIMES', kind='stable')
)
df[idv] = df['_TIMES'].astype(np.float64)
df.drop(['_TIMES', '_RESETGROUP'], axis=1, inplace=True)
if flag:
df.rename(columns={'_EXPANDED': 'EXPANDED'}, inplace=True)
else:
df.drop([addl, ii, '_EXPANDED'], axis=1, inplace=True)
model = model.replace(dataset=df.reset_index(drop=True))
return model.update_source()
def get_doseid(model: Model):
"""Get a DOSEID series from the dataset with an id of each dose period starting from 1
    If a dose and an observation exist at the same time point, the observation will be counted
    towards the previous dose.
Parameters
----------
model : Model
Pharmpy model
Returns
-------
pd.Series
DOSEIDs
Examples
--------
>>> from pharmpy.modeling import load_example_model, get_doseid
>>> model = load_example_model("pheno")
>>> get_doseid(model) # doctest: +ELLIPSIS
0 1
1 1
2 2
3 3
4 4
..
739 10
740 11
741 12
742 13
743 13
Name: DOSEID, Length: 744, dtype: int...
"""
try:
dose = model.datainfo.typeix['dose'][0].name
except IndexError:
raise DatasetError('Could not identify dosing rows in dataset')
df = model.dataset.copy()
df['DOSEID'] = df[dose]
df.loc[df['DOSEID'] > 0, 'DOSEID'] = 1
df['DOSEID'] = df['DOSEID'].astype(int)
idcol = model.datainfo.id_column.name
df['DOSEID'] = df.groupby(idcol)['DOSEID'].cumsum()
# Adjust for dose and observation at the same time point
# Observation is moved to previous dose group
# Except for steady state dose where the dose group is kept
try:
eventcol = model.datainfo.typeix['event'][0].name
except IndexError:
df['_RESETGROUP'] = 1.0
else:
df['_FLAG'] = df[eventcol] >= 3
        df['_RESETGROUP'] = df.groupby(idcol)['_FLAG'].cumsum()
try:
ss = model.datainfo.typeix['ss'][0].name
except IndexError:
ss = None
idvcol = model.datainfo.idv_column.name
ser = df.groupby([idcol, idvcol, '_RESETGROUP']).size()
nonunique = ser[ser > 1]
for i, time, _ in nonunique.index:
groupind = df[(df[idcol] == i) & (df[idvcol] == time)].index
obsind = df[(df[idcol] == i) & (df[idvcol] == time) & (df[dose] == 0)].index
doseind = set(groupind) - set(obsind)
if not doseind:
continue
maxind = max(doseind)
for index in obsind:
if 0 in groupind: # This is the first dose
continue
if maxind > index: # Dose record is after the observation
continue
if ss and df.loc[maxind, ss] > 0: # No swap for SS dosing
continue
curdoseid = df.loc[index, 'DOSEID']
df.loc[index, 'DOSEID'] = curdoseid - 1
return df['DOSEID'].copy()
def get_mdv(model: Model):
"""Get MDVs from dataset
Parameters
----------
model : Model
Pharmpy model
Returns
-------
pd.Series
MDVs
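    Example
    -------
    An illustrative sketch using the pheno example model (output abbreviated
    and not verified):
    >>> from pharmpy.modeling import load_example_model, get_mdv
    >>> model = load_example_model("pheno")
    >>> get_mdv(model)  # doctest: +SKIP
    0      1
    1      0
          ..
    743    0
    Name: MDV, Length: 744, dtype: int32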
"""
found = False
for key in ['mdv', 'event', 'dose']:
try:
label = model.datainfo.typeix[key][0].name
found = True
break
except IndexError:
pass
else:
label = model.datainfo.dv_column.name
data = get_and_check_dataset(model)[label].astype('float64').squeeze()
series = data.where(data == 0, other=1) if found else pd.Series(np.zeros(len(data)))
return series.astype('int32').rename('MDV')
def get_evid(model: Model):
"""Get the evid from model dataset
If an event column is present this will be extracted otherwise
an evid column will be created.
Parameters
----------
model : Model
Pharmpy model
Returns
-------
pd.Series
EVID
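    Example
    -------
    An illustrative sketch using the pheno example model, which has no event
    column, so the EVID is derived from MDV (output not verified):
    >>> from pharmpy.modeling import load_example_model, get_evid
    >>> model = load_example_model("pheno")
    >>> get_evid(model)  # doctest: +SKIP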
"""
di = model.datainfo
try:
eventcols = di.typeix['event']
except IndexError:
pass
else:
return model.dataset[eventcols[0].name]
mdv = get_mdv(model)
return mdv.rename('EVID')
def get_admid(model: Model):
"""Get the admid from model dataset
If an administration column is present this will be extracted otherwise
    an admid column will be created based on the admids of the present doses.
    Correct generation depends on the presence of a CMT column.
    When generated, the admid of events in between doses is set to the last
    used admid.
Parameters
----------
model : Model
Pharmpy model
Returns
-------
pd.Series
ADMID
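    Example
    -------
    An illustrative sketch using the pheno example model (output not
    verified):
    >>> from pharmpy.modeling import load_example_model, get_admid
    >>> model = load_example_model("pheno")
    >>> get_admid(model)  # doctest: +SKIP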
"""
di = model.datainfo
try:
admidcols = di.typeix["admid"]
except IndexError:
pass
else:
return model.dataset[admidcols[0].name]
    odes = model.statements.ode_system
    remap = {}
    if isinstance(odes, CompartmentalSystem):
        # Map each dosing compartment number to the admid of its first dose
        names = odes.compartment_names
        for dosing in odes.dosing_compartments:
            remap[names.index(dosing.name) + 1] = dosing.doses[0].admid
adm = get_cmt(model)
adm = adm.replace(remap)
adm.name = "ADMID"
# Replace all observations with the previous admid type
    current_admin = adm[0]
    idname = model.datainfo.id_column.name
    current_subject = model.dataset[idname][0]
    for i, (event, admin, subject) in enumerate(
        zip(get_evid(model), adm, model.dataset[idname])
    ):
if current_subject == subject:
            if event == 1:
                current_admin = admin
            else:
                adm[i] = current_admin
else:
current_subject = subject
current_admin = admin
return adm
def add_admid(model: Model):
"""
    Add an admid column to the model dataset and datainfo. Correct generation
    depends on the presence of a CMT column.
    When generated, the admid of events in between doses is set to the last
    used admid.
Parameters
----------
model : Model
Pharmpy model
Returns
-------
model : Model
Pharmpy model
See also
--------
get_admid : Get or create an admid column
get_cmt : Get or create a cmt column
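    Example
    -------
    An illustrative sketch (not verified):
    >>> from pharmpy.modeling import load_example_model, add_admid
    >>> model = load_example_model("pheno")
    >>> model = add_admid(model)  # doctest: +SKIP
    >>> "ADMID" in model.dataset.columns  # doctest: +SKIP
    True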
"""
di = model.datainfo
if "admid" not in di.types:
adm = get_admid(model)
dataset = model.dataset
dataset["ADMID"] = adm
di = update_datainfo(model.datainfo, dataset)
colinfo = di['ADMID'].replace(type='admid')
model = model.replace(datainfo=di.set_column(colinfo), dataset=dataset)
return model.update_source()
def set_admid(model: Model, column_name: str):
"""
Set the specified column in the dataset to the admid data type.
Parameters
----------
model : Model
Pharmpy model
column_name : str
name of column to set as admid
Returns
-------
model : Model
Pharmpy model
See also
--------
get_admid : Get or create an admid column
add_admid : Add an admid column to the dataset
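    Example
    -------
    An illustrative sketch; "ADM" is a hypothetical column holding
    administration ids (not verified):
    >>> model = set_admid(model, "ADM")  # doctest: +SKIP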
"""
di = model.datainfo
colinfo = di[column_name].replace(type="admid")
model = model.replace(datainfo=di.set_column(colinfo))
return model.update_source()
def get_cmt(model: Model):
"""Get the cmt (compartment) column from the model dataset
    If a cmt column is present this will be extracted, otherwise
    a cmt column will be created. When creating the column, distinguishing
    multiple dose compartments depends on the presence of an admid type column;
    without one, only dose/non-dose is considered.
Parameters
----------
model : Model
Pharmpy model
Returns
-------
pd.Series
CMT
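    Example
    -------
    An illustrative sketch using the pheno example model, where dose rows map
    to compartment 1 and observation rows to 0 (output abbreviated and not
    verified):
    >>> from pharmpy.modeling import load_example_model, get_cmt
    >>> model = load_example_model("pheno")
    >>> get_cmt(model)  # doctest: +SKIP
    0      1
    1      0
          ..
    743    0
    Name: CMT, Length: 744, dtype: int32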
"""
di = model.datainfo
try:
cmtcols = di.typeix['compartment']
except IndexError:
pass
else:
return model.dataset[cmtcols[0].name]
# See if admid exist
try:
admidcols = di.typeix["admid"]
except IndexError:
# No admid found --> Assume dose/non-dose
odes = model.statements.ode_system
if isinstance(odes, CompartmentalSystem):
dosing = odes.dosing_compartments[0]
names = odes.compartment_names
dose_cmt = names.index(dosing.name) + 1
else:
dose_cmt = 1
cmt = get_evid(model)
cmt = cmt.replace({1: dose_cmt, 2: 0, 3: 0, 4: dose_cmt}) # Only consider dose/non-dose
cmt.name = "CMT"
return cmt
else:
admidcols = model.dataset[admidcols[0].name]
# Admid found -> convert to CMT based on doses
    odes = model.statements.ode_system
    remap = {}
    central_number = 1  # Fallback when there is no compartmental system
    if isinstance(odes, CompartmentalSystem):
        names = odes.compartment_names
        # Observation rows are always mapped to the central compartment,
        # even if the central compartment is not a dosing compartment
        central_number = names.index(odes.central_compartment.name) + 1
        for dosing in odes.dosing_compartments:
            if dosing == odes.central_compartment:
                remap[2] = names.index(dosing.name) + 1
            else:
                remap[1] = names.index(dosing.name) + 1
    admidcols = admidcols.replace(remap)
    admidcols.loc[get_evid(model) == 0] = central_number
    admidcols.name = "CMT"
    return admidcols
def add_cmt(model: Model):
"""Add a CMT column to the model dataset and datainfo if not existed
In case of multiple doses, this method is dependent on the presence of an
admid column to correctly number each dose.
NOTE : Existing CMT is based on datainfo type being set to 'compartment'
and a column named 'CMT' can be replaced
Parameters
----------
model : Model
Pharmpy model
Returns
-------
model : Model
Pharmpy model
See also
--------
get_admid : Get or create an admid column
get_cmt : Get or create a cmt column
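    Example
    -------
    An illustrative sketch (not verified):
    >>> from pharmpy.modeling import load_example_model, add_cmt
    >>> model = load_example_model("pheno")
    >>> model = add_cmt(model)  # doctest: +SKIP
    >>> "CMT" in model.dataset.columns  # doctest: +SKIP
    True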
"""
di = model.datainfo
if "compartment" not in di.types:
cmt_name = "CMT"
cmt = get_cmt(model)
dataset = model.dataset
dataset[cmt_name] = cmt
di = update_datainfo(model.datainfo, dataset)
colinfo = di[cmt_name].replace(type='compartment')
model = model.replace(datainfo=di.set_column(colinfo), dataset=dataset)
return model.update_source()
def add_time_after_dose(model: Model):
"""Calculate and add a TAD column to the dataset
Parameters
----------
model : Model
Pharmpy model
Returns
-------
Model
Pharmpy model object
Examples
--------
>>> from pharmpy.modeling import load_example_model, add_time_after_dose
>>> model = load_example_model("pheno")
>>> model = add_time_after_dose(model)
"""
try:
model.datainfo.descriptorix['time after dose']
except IndexError:
pass
else:
# Already have time after dose
return model
temp = translate_nmtran_time(model)
idv = temp.datainfo.idv_column.name
idlab = temp.datainfo.id_column.name
df = get_and_check_dataset(model).copy()
df['_NEWTIME'] = temp.dataset[idv]
try:
addl = temp.datainfo.typeix['additional'][0].name
except IndexError:
addl = None
else:
# FIXME: Temp workaround, should be canonicalized in Model.replace
di = update_datainfo(temp.datainfo, df)
new_idvcol = di.idv_column.replace(type='unknown')
new_timecol = di['_NEWTIME'].replace(type='idv')
di = di.set_column(new_idvcol).set_column(new_timecol)
temp = temp.replace(datainfo=di, dataset=df)
temp = expand_additional_doses(temp, flag=True)
df = temp.dataset
df['_DOSEID'] = get_doseid(temp)
# Sort in case DOSEIDs are non-increasing
df = (
df.groupby(idlab)[df.columns]
.apply(lambda x: x.sort_values(by=['_DOSEID'], kind='stable', ignore_index=True))
.reset_index(drop=True)
)
df['TAD'] = df.groupby([idlab, '_DOSEID'])['_NEWTIME'].diff().fillna(0.0)
df['TAD'] = df.groupby([idlab, '_DOSEID'])['TAD'].cumsum()
if addl:
df = df[~df['EXPANDED']].reset_index(drop=True)
df.drop(columns=['EXPANDED'], inplace=True)
df.drop(columns=['_NEWTIME', '_DOSEID'], inplace=True)
# FIXME: Temp workaround, should be canonicalized in Model.replace
di = update_datainfo(model.datainfo, df)
colinfo = di['TAD'].replace(descriptor='time after dose', unit=di[idv].unit)
model = model.replace(datainfo=di.set_column(colinfo), dataset=df)
return model.update_source()
def get_concentration_parameters_from_data(model: Model):
"""Create a dataframe with concentration parameters
Note that all values are directly calculated from the dataset
Parameters
----------
model : Model
Pharmpy model object
Returns
-------
pd.DataFrame
Concentration parameters
Examples
--------
>>> from pharmpy.modeling import load_example_model, get_concentration_parameters_from_data
>>> model = load_example_model("pheno")
>>> get_concentration_parameters_from_data(model)
Cmax Tmax Cmin Tmin
ID DOSEID
1 1 17.3 2.0 NaN NaN
2 NaN NaN NaN NaN
3 NaN NaN NaN NaN
4 NaN NaN NaN NaN
5 NaN NaN NaN NaN
... ... ... ... ...
59 9 NaN NaN NaN NaN
10 NaN NaN NaN NaN
11 NaN NaN NaN NaN
12 NaN NaN NaN NaN
13 40.2 2.0 NaN NaN
<BLANKLINE>
[589 rows x 4 columns]
"""
model = add_time_after_dose(model)
doseid = get_doseid(model)
df = get_and_check_dataset(model).copy()
df['DOSEID'] = doseid
idlab = model.datainfo.id_column.name
dv = model.datainfo.dv_column.name
noobs = df.groupby([idlab, 'DOSEID']).size() == 1
idx = df.groupby([idlab, 'DOSEID'])[dv].idxmax()
params = df.loc[idx].set_index([idlab, 'DOSEID'])
params = params[[dv, 'TAD']]
params.rename(columns={dv: 'Cmax', 'TAD': 'Tmax'}, inplace=True)
params.loc[noobs] = np.nan
    grpind = df.groupby([idlab, 'DOSEID']).indices
keep = []
for ind, rows in grpind.items():
index = idx.loc[ind]
p = params.loc[ind]
if not np.isnan(p['Tmax']):
keep += [row for row in rows if row > index]
minidx = df.iloc[keep].groupby([idlab, 'DOSEID'])[dv].idxmin()
params2 = df.loc[minidx].set_index([idlab, 'DOSEID'])
params2 = params2[[dv, 'TAD']]
params2.rename(columns={dv: 'Cmin', 'TAD': 'Tmin'}, inplace=True)
res = params.join(params2)
return res
def drop_dropped_columns(model: Model):
"""Drop columns marked as dropped from the dataset
NM-TRAN date columns will not be dropped by this function
even if marked as dropped.
Columns not specified in the datainfo ($INPUT for NONMEM)
will also be dropped from the dataset.
Parameters
----------
model : Model
Pharmpy model object
Returns
-------
Model
Pharmpy model object
Example
-------
>>> from pharmpy.modeling import *
>>> model = load_example_model("pheno")
>>> model = drop_dropped_columns(model)
>>> list(model.dataset.columns)
['ID', 'TIME', 'AMT', 'WGT', 'APGR', 'DV', 'FA1', 'FA2']
See also
--------
drop_columns : Drop specific columns or mark them as drop
"""
datainfo = model.datainfo
todrop = [
colname
for colname in datainfo.names
if datainfo[colname].drop and datainfo[colname].datatype != 'nmtran-date'
]
df = get_and_check_dataset(model)
todrop += list(set(df.columns) - set(datainfo.names))
model = drop_columns(model, todrop)
return model.update_source()
def drop_columns(model: Model, column_names: Union[list[str], str], mark: bool = False):
"""Drop columns from the dataset or mark as dropped
Parameters
----------
model : Model
Pharmpy model object
column_names : list or str
List of column names or one column name to drop or mark as dropped
mark : bool
Default is to remove column from dataset. Set this to True to only mark as dropped
Returns
-------
Model
Pharmpy model object
Example
-------
>>> from pharmpy.modeling import *
>>> model = load_example_model("pheno")
>>> model = drop_columns(model, ['WGT', 'APGR'])
>>> list(model.dataset.columns)
['ID', 'TIME', 'AMT', 'DV', 'FA1', 'FA2']
See also
--------
drop_dropped_columns : Drop all columns marked as drop
undrop_columns : Undrop columns of model
"""
if isinstance(column_names, str):
column_names = [column_names]
di = model.datainfo
newcols, to_drop = [], []
for col in di:
if col.name in column_names:
if mark:
newcol = col.replace(drop=True)
newcols.append(newcol)
else:
to_drop.append(col.name)
else:
newcols.append(col)
replace_dict = {'datainfo': di.replace(columns=newcols)}
if to_drop:
df = get_and_check_dataset(model).copy()
replace_dict['dataset'] = df.drop(to_drop, axis=1)
model = model.replace(**replace_dict)
return model.update_source()
def undrop_columns(model: Model, column_names: Union[list[str], str]):
"""Undrop columns of model
Parameters
----------
model : Model
Pharmpy model object
column_names : list or str
List of column names or one column name to undrop
Returns
-------
Model
Pharmpy model object
Example
-------
>>> from pharmpy.modeling import *
>>> model = load_example_model("pheno")
>>> model = drop_columns(model, ['WGT', 'APGR'], mark=True)
>>> model = undrop_columns(model, 'WGT')
See also
--------
drop_dropped_columns : Drop all columns marked as drop
drop_columns : Drop or mark columns as dropped
"""
if isinstance(column_names, str):
column_names = [column_names]
di = model.datainfo
newcols = []
for col in di:
if col.name in column_names:
newcol = col.replace(drop=False)
newcols.append(newcol)
else:
newcols.append(col)
model = model.replace(datainfo=di.replace(columns=newcols))
return model.update_source()
def _translate_nonmem_time_value(time):
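    """Convert a single NM-TRAN TIME value to hours
    A clock value such as "10:15" is interpreted as 10 hours and 15 minutes,
    i.e. 10.25; any other value is parsed as a plain float.
    """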
if ':' in time:
components = time.split(':')
if len(components) != 2:
raise DatasetError(f'Bad TIME format: {time}')
hours = float(components[0]) + float(components[1]) / 60
return hours
else:
return float(time)
def _translate_time_column(df, timecol, idcol):
if df[timecol].dtype != np.float64:
df[timecol] = df[timecol].apply(_translate_nonmem_time_value)
df[timecol] = df[timecol] - df.groupby(idcol)[timecol].transform('first')
return df
def _translate_nonmem_time_and_date_value(ser, timecol, datecol):
timeval = _translate_nonmem_time_value(ser[timecol])
date = ser[datecol]
a = re.split(r'[^0-9]', date)
    if date.startswith('-') or len(a) == 1:
        # A plain number of days (possibly negative) relative to day zero
        return timeval + float(date) * 24
    elif len(a) == 2:
        # Only day and month given; use a fixed non leap year
        year = 2001  # Non leap year
        month = a[1]
        day = a[0]
    elif len(a) == 3:
        if datecol.endswith('E'):  # DATE: month, day and year
            month = a[0]
            day = a[1]
            year = a[2]
        elif datecol.endswith('1'):  # DAT1: day, month and year
            day = a[0]
            month = a[1]
            year = a[2]
        elif datecol.endswith('3'):  # DAT3: year, day and month
            year = a[0]
            day = a[1]
            month = a[2]
        else:  # Let DAT2 (year, month and day) be the default for other names
            year = a[0]
            month = a[1]
            day = a[2]
if len(year) < 3:
year = int(year)
if year > 50:
year += 1900
else:
year += 2000
else:
year = int(year)
month = int(month)
day = int(day)
hour = int(timeval)
timeval = (timeval - hour) * 60
minute = int(timeval)
timeval = (timeval - minute) * 60
second = int(timeval)
timeval = (timeval - second) * 1000000
microsecond = int(timeval)
timeval = (timeval - microsecond) * 1000
nanosecond = int(timeval)
ts = pd.Timestamp(
year=year,
month=month,
day=day,
hour=hour,
minute=minute,
second=second,
microsecond=microsecond,
nanosecond=nanosecond,
)
return ts
else:
raise DatasetError(f'Bad DATE value: {date}')
def _translate_time_and_date_columns(df, timecol, datecol, idcol):
df[timecol] = df.apply(
_translate_nonmem_time_and_date_value, axis=1, timecol=timecol, datecol=datecol
)
timediff = df[timecol] - df.groupby(idcol)[timecol].transform('first')
if df[timecol].dtype != np.float64:
df[timecol] = timediff.dt.total_seconds() / 3600
return df
def _find_time_and_date_columns(model):
# Both time and date can be None. If date is None time must be not None
time = None
date = None
di = model.datainfo
for col in di:
if col.datatype == 'nmtran-time' and not col.drop:
if time is None:
time = col
else:
raise ValueError(f"Multiple time columns found {time} and {col.name}")
elif col.datatype == 'nmtran-date' and not col.drop:
if date is None:
date = col
else:
raise ValueError(f"Multiple date columns found {date} and {col.name}")
if time is None and date is not None:
raise ValueError(f"Found date column {date}, but no time column")
return time, date
def translate_nmtran_time(model: Model):
"""Translate NM-TRAN TIME and DATE column into one TIME column
    If the dataset of the model has the special NM-TRAN TIME and DATE columns,
    these will be translated into a single time column with time in hours.
Warnings
--------
Use this function with caution. For example reset events are currently not taken into account.
Parameters
----------
model : Model
Pharmpy model object
Returns
-------
Model
Pharmpy model object
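    Example
    -------
    An illustrative sketch assuming a model whose dataset has NM-TRAN TIME
    and DATE columns (not verified):
    >>> model = translate_nmtran_time(model)  # doctest: +SKIP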
"""
if model.dataset is None:
return model
timecol, datecol = _find_time_and_date_columns(model)
df = model.dataset.copy()
di = model.datainfo
idname = di.id_column.name
if datecol is None:
if timecol is None:
return model
else:
df = _translate_time_column(df, timecol.name, idname)
else:
assert timecol is not None
df = _translate_time_and_date_columns(df, timecol.name, datecol.name, idname)
model = drop_columns(model, datecol.name)
timecol = timecol.replace(unit='h')
timecol = timecol.replace(datatype='float64')
di = di.set_column(timecol)
model = model.replace(datainfo=di, dataset=df)
return model.update_source()
def _loq_mask(
model: Model,
lloq: Optional[Union[float, str]] = None,
uloq: Optional[Union[float, str]] = None,
blq: Optional[str] = None,
alq: Optional[str] = None,
):
"""Boolean series with False for lloq records and True for non-lloq
Options as remove_loq_data
"""
if blq and lloq:
raise ValueError("Cannot specify blq and lloq at the same time")
if alq and uloq:
raise ValueError("Cannot specify alq and uloq at the same time")
df = get_and_check_dataset(model)
    # dv, mdv and which_keep are needed for all of the lloq/uloq/blq/alq
    # paths, not only when lloq or uloq are given
    dv = model.datainfo.dv_column.name
    mdv = get_mdv(model)
    which_keep = pd.Series(True, index=df.index)
if isinstance(lloq, str):
lloq = df[lloq]
if isinstance(uloq, str):
uloq = df[uloq]
if lloq is not None:
which_keep &= (df[dv] > lloq) | mdv
elif blq is not None:
which_keep &= (df[blq] == 0) | mdv
if uloq is not None:
which_keep &= (df[dv] < uloq) | mdv
elif alq is not None:
which_keep &= (df[alq] == 0) | mdv
return which_keep
def remove_loq_data(
model: Model,
lloq: Optional[Union[float, str]] = None,
uloq: Optional[Union[float, str]] = None,
blq: Optional[str] = None,
alq: Optional[str] = None,
keep: int = 0,
):
"""Remove loq data records from the dataset
Does nothing if none of the limits are specified.
Parameters
----------
model : Model
Pharmpy model object
lloq : float or str
Value or column name for lower limit of quantification.
uloq : float or str
Value or column name for upper limit of quantification.
blq : str
Column name for below limit of quantification indicator.
alq : str
Column name for above limit of quantification indicator.
keep : int
Number of loq records to keep for each run of consecutive loq records.
Returns
-------
Model
Pharmpy model object
Examples
--------
>>> from pharmpy.modeling import *
>>> model = load_example_model("pheno")
>>> model = remove_loq_data(model, lloq=10, uloq=40)
>>> len(model.dataset)
736
See also
--------
set_lloq_data
transform_blq
"""
which_keep = _loq_mask(model, lloq=lloq, uloq=uloq, blq=blq, alq=alq)
df = get_and_check_dataset(model)
if keep > 0:
idcol = model.datainfo.id_column.name
        keep_df = pd.DataFrame(
            {idcol: df[idcol], 'consec': (~which_keep).diff().ne(0).cumsum(), 'remove': ~which_keep}
        )
obj = keep_df.groupby([idcol, 'consec']).cumsum().le(keep)['remove']
which_keep = obj | which_keep
model = model.replace(dataset=df[which_keep])
return model.update_source()
def set_lloq_data(
model: Model,
value: Union[str, float, Expr],
lloq: Optional[Union[float, str]] = None,
blq: Optional[str] = None,
):
"""Set a dv value for lloq data records
Parameters
----------
model : Model
Pharmpy model object
value : float or Expr
The new dv value
lloq : float or str
Value or column name for lower limit of quantification.
blq : str
Column name for below limit of quantification indicator.
Returns
-------
Model
Pharmpy model object
Examples
--------
>>> from pharmpy.modeling import *
>>> model = load_example_model("pheno")
>>> model = set_lloq_data(model, 0, lloq=10)
See also
--------
remove_loq_data
transform_blq
"""
which_keep = _loq_mask(model, lloq=lloq, blq=blq)
df = model.dataset.copy()
dv = model.datainfo.dv_column.name
if isinstance(value, Expr) or isinstance(value, str):
value = df.eval(str(value))
df[dv] = df[dv].where(which_keep, value)
model = model.replace(dataset=df)
return model
def set_reference_values(model: Model, refs: dict[str, Union[int, float]]):
"""Set reference values for selected columns
All values for each selected column will be replaced. For dose columns
only the values for dosing events will be replaced.
Parameters
----------
model : Model
Pharmpy model object
refs : dict
Pairs of column names and reference values
Returns
-------
Model
Pharmpy model object
Examples
--------
>>> from pharmpy.modeling import *
>>> model = load_example_model("pheno")
>>> model = set_reference_values(model, {'WGT': 0.5, 'AMT': 4.0})
>>> model.dataset
ID TIME AMT WGT APGR DV FA1 FA2
0 1 0.0 4.0 0.5 7.0 0.0 1.0 1.0
1 1 2.0 0.0 0.5 7.0 17.3 0.0 0.0
2 1 12.5 4.0 0.5 7.0 0.0 1.0 1.0
3 1 24.5 4.0 0.5 7.0 0.0 1.0 1.0
4 1 37.0 4.0 0.5 7.0 0.0 1.0 1.0
.. .. ... ... ... ... ... ... ...
739 59 108.3 4.0 0.5 6.0 0.0 1.0 1.0
740 59 120.5 4.0 0.5 6.0 0.0 1.0 1.0
741 59 132.3 4.0 0.5 6.0 0.0 1.0 1.0
742 59 144.8 4.0 0.5 6.0 0.0 1.0 1.0
743 59 146.8 0.0 0.5 6.0 40.2 0.0 0.0
<BLANKLINE>
[744 rows x 8 columns]
"""
df = model.dataset
di = model.datainfo
newcols = dict()
dtypes = dict()
for colname, value in refs.items():
if di[colname].type == 'dose':
newdose = df[colname].mask(df[colname] > 0, value)
newcols[colname] = newdose
else:
newcols[colname] = value
datatype = ColumnInfo.convert_datatype_to_pd_dtype(di[colname].datatype)
dtypes[colname] = datatype
df = df.assign(**newcols).astype(dtypes)
model = model.replace(dataset=df)
return model
class Checker:
_all_checks = (
('A1', 'Body weight has unit'),
('A2', 'Body weight has mass unit'),
('A3', 'Body weight >0 and <700kg'),
('A4', 'Age has unit'),
('A5', 'Age has time unit'),
('A6', 'Age >=0 and <130 years'),
('A7', 'Lean body mass has unit'),
('A8', 'Lean body mass has mass unit'),
('A9', 'Lean body mass >0 and <700kg'),
('A10', 'Fat free mass has unit'),
('A11', 'Fat free mass has mass unit'),
('A12', 'Fat free mass >0 and <700kg'),
('D1', 'Time after dose has unit'),
('D2', 'Time after dose has time unit'),
('D3', 'Time after dose >=0'),
('D4', 'Plasma concentration has unit'),
('D5', 'Plasma concentration has mass/volume unit'),
('D6', 'Plasma concentration >= 0'),
('I1', 'Subject identifier is unitless'),
)
def __init__(self, datainfo, dataset, verbose=False):
self.datainfo = datainfo
self.dataset = dataset
self.verbose = verbose
self.check_results = {}
self.violations = []
def set_result(self, code, test=False, violation=None, skip=False, warn=False):
if skip:
result = "SKIP"
elif test:
result = "OK"
else:
if warn:
result = "WARN"
else:
result = "FAIL"
if code not in self.check_results or (
code in self.check_results
and (
self.check_results[code] == 'SKIP'
or self.check_results[code] == 'OK'
and result in ('WARN', 'FAIL')
or self.check_results[code] == 'WARN'
and result == 'FAIL'
)
):
self.check_results[code] = result
if result in ('WARN', 'FAIL'):
self.violations.append((code, result, violation))
def check_has_unit(self, code, col):
has_unit = col.unit is not None
self.set_result(code, test=has_unit, violation=col.name, warn=True)
return has_unit
def check_is_unitless(self, code, col):
is_unitless = col.unit == Unit.unitless()
self.set_result(code, test=is_unitless, violation=col.name, warn=True)
def check_dimension(self, code, column, dim):
if column.unit is None:
self.set_result(code, skip=True)
return False
else:
dim2 = sympy.physics.units.Dimension(
sympy.physics.units.si.SI.get_dimensional_expr(column.unit._expr)
)
self.set_result(
code,
test=dim == dim2,
violation=f"Unit {column.unit} of {column.name} is not a {dim} unit",
)
return dim == dim2
def check_range(self, code, col, lower, upper, unit, lower_included=True, upper_included=True):
name = col.name
if lower == 0:
scaled_lower = lower
else:
scaled_lower = float(
sympy.physics.units.convert_to(lower * unit, col.unit._expr) / col.unit._expr
)
if upper == 0:
scaled_upper = upper
else:
scaled_upper = float(
sympy.physics.units.convert_to(upper * unit, col.unit._expr) / col.unit._expr
)
if lower_included:
lower_viol = self.dataset[name] < scaled_lower
else:
lower_viol = self.dataset[name] <= scaled_lower
if upper_included:
upper_viol = self.dataset[name] > scaled_upper
else:
upper_viol = self.dataset[name] >= scaled_upper
all_viol = lower_viol | upper_viol
violations = all_viol[all_viol]
if not violations.empty:
for i in violations.index:
self.set_result(
code,
test=False,
violation=f"{col.name} index={i} value={self.dataset[name].loc[i]}",
)
else:
self.set_result(code, test=True)
def get_dataframe(self):
codes = []
checks = []
results = []
violations = []
for code, msg in Checker._all_checks:
if code not in self.check_results:
self.check_results[code] = "SKIP"
if self.check_results[code] in ['OK', 'SKIP']:
if self.verbose:
codes.append(code)
checks.append(msg)
results.append(self.check_results[code])
violations.append(None)
else:
for viol in self.violations:
                    if viol[0] == code and (
                        viol[1] == "FAIL" or (viol[1] == "WARN" and self.verbose)
                    ):
codes.append(code)
checks.append(msg)
results.append(viol[1])
violations.append(viol[2])
df = pd.DataFrame(
{'code': codes, 'check': checks, 'result': results, 'violation': violations}
)
return df
def print(self):
table = rich_table.Table(title="Dataset checks", box=rich_box.SQUARE)
table.add_column("Code")
table.add_column("Check")
table.add_column("Result")
table.add_column("Violation")
for code, msg in Checker._all_checks:
if code not in self.check_results:
self.check_results[code] = "SKIP"
if self.check_results[code] in ['OK', 'SKIP']:
if self.verbose:
table.add_row(code, msg, f'[bold green]{self.check_results[code]}', "")
else:
for viol in self.violations:
                    if viol[0] == code and (
                        viol[1] == "FAIL" or (viol[1] == "WARN" and self.verbose)
                    ):
result = viol[1]
if result == "FAIL":
result = f"[bold red]{result}"
else:
result = f"[bold yellow]{result}"
table.add_row(code, msg, result, viol[2])
if table.rows: # Do not print an empty table
console = rich_console.Console()
console.print(table)
def check_dataset(model: Model, dataframe: bool = False, verbose: bool = False):
"""Check dataset for consistency across a set of rules
Parameters
----------
model : Model
Pharmpy model object
dataframe : bool
True to return a DataFrame instead of printing to the console
verbose : bool
Print out all rules checked if True else print only failed rules
Returns
-------
pd.DataFrame
        Only returns a DataFrame if dataframe=True
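    Example
    -------
    An illustrative sketch; the resulting rows depend on the dataset and its
    datainfo (not verified):
    >>> from pharmpy.modeling import load_example_model, check_dataset
    >>> model = load_example_model("pheno")
    >>> df = check_dataset(model, dataframe=True)  # doctest: +SKIP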
"""
di = model.datainfo
df = model.dataset
checker = Checker(di, df, verbose=verbose)
for col in di:
if col.descriptor == "body weight":
checker.check_has_unit("A1", col)
samedim = checker.check_dimension("A2", col, sympy.physics.units.mass)
if samedim:
checker.check_range("A3", col, 0, 700, sympy.physics.units.kg, False, False)
if col.descriptor == "age":
checker.check_has_unit("A4", col)
samedim = checker.check_dimension("A5", col, sympy.physics.units.time)
if samedim:
checker.check_range("A6", col, 0, 130, sympy.physics.units.year, True, False)
if col.descriptor == "lean body mass":
checker.check_has_unit("A7", col)
samedim = checker.check_dimension("A8", col, sympy.physics.units.mass)
if samedim:
checker.check_range("A9", col, 0, 700, sympy.physics.units.kg, False, False)
if col.descriptor == "fat free mass":
checker.check_has_unit("A10", col)
samedim = checker.check_dimension("A11", col, sympy.physics.units.mass)
if samedim:
checker.check_range("A12", col, 0, 700, sympy.physics.units.kg, False, False)
if col.descriptor == "time after dose":
checker.check_has_unit("D1", col)
samedim = checker.check_dimension("D2", col, sympy.physics.units.time)
if samedim:
checker.check_range(
"D3", col, 0, float('inf'), sympy.physics.units.second, True, False
)
if col.descriptor == "plasma concentration":
checker.check_has_unit("D4", col)
samedim = checker.check_dimension(
"D5", col, sympy.physics.units.mass / sympy.physics.units.length**3
)
if samedim:
checker.check_range(
"D6",
col,
0,
float('inf'),
sympy.physics.units.kg / sympy.physics.units.L,
True,
False,
)
if col.descriptor == "subject identifier":
checker.check_is_unitless("I1", col)
if dataframe:
return checker.get_dataframe()
else:
checker.print()
def read_dataset_from_datainfo(
datainfo: Union[DataInfo, Path, str], datatype: Optional[str] = None
):
"""Read a dataset given a datainfo object or path to a datainfo file
Parameters
----------
datainfo : DataInfo | Path | str
A datainfo object or a path to a datainfo object
    datatype : str
        A string to specify dataset type (e.g. 'nonmem')
Returns
-------
pd.DataFrame
The dataset
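    Example
    -------
    An illustrative sketch; "pheno.datainfo" is a hypothetical path
    (not verified):
    >>> df = read_dataset_from_datainfo("pheno.datainfo", datatype="nonmem")  # doctest: +SKIP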
"""
if not isinstance(datainfo, DataInfo):
datainfo = DataInfo.read_json(datainfo)
if datainfo.path is None:
raise ValueError('datainfo.path is None')
from pharmpy.model.external.nonmem.dataset import read_nonmem_dataset
from pharmpy.model.external.nonmem.parsing import filter_observations
if datatype == 'nonmem':
drop = [col.drop for col in datainfo]
df = read_nonmem_dataset(
datainfo.path,
ignore_character='@',
drop=drop,
colnames=datainfo.names,
dtype=datainfo.get_dtype_dict(),
)
# This assumes a PK model
df = filter_observations(df, datainfo)
else:
df = pd.read_csv(
datainfo.path,
sep=datainfo.separator,
dtype=datainfo.get_dtype_dict(),
float_precision='round_trip',
)
return df
def create_default_datainfo(path_or_df):
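    """Create a default DataInfo given a dataset path or a DataFrame
    If a .datainfo file exists next to the given path it is read and used.
    Otherwise column types are inferred from standard NM-TRAN column names
    (e.g. ID, TIME, AMT, DV, EVID, MDV, SS and II).
    """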
if not isinstance(path_or_df, pd.DataFrame):
path = path_absolute(path_or_df)
datainfo_path = path.with_suffix('.datainfo')
if datainfo_path.is_file():
di = DataInfo.read_json(datainfo_path)
di = di.replace(path=path)
return di
else:
with open(path) as file:
first_line = file.readline()
if ',' not in first_line:
colnames = list(pd.read_csv(path, nrows=0, sep=r'\s+'))
separator = r'\s+'
else:
colnames = list(pd.read_csv(path, nrows=0))
separator = ','
else:
colnames = path_or_df.columns
separator = None
path = None
column_info = []
for colname in colnames:
if colname == 'ID' or colname == 'L1':
info = ColumnInfo.create(colname, type='id', scale='nominal', datatype='int32')
elif colname == 'DV':
info = ColumnInfo.create(colname, type='dv')
elif colname == 'TIME':
if not set(colnames).isdisjoint({'DATE', 'DAT1', 'DAT2', 'DAT3'}):
datatype = 'nmtran-time'
else:
datatype = 'float64'
info = ColumnInfo.create(colname, type='idv', scale='ratio', datatype=datatype)
elif colname == 'EVID':
info = ColumnInfo.create(colname, type='event', scale='nominal')
elif colname == 'MDV':
if 'EVID' in colnames:
info = ColumnInfo.create(colname, type='mdv')
else:
info = ColumnInfo.create(colname, type='event', scale='nominal', datatype='int32')
elif colname == 'AMT':
info = ColumnInfo.create(colname, type='dose', scale='ratio')
elif colname == 'BLQ':
info = ColumnInfo.create(colname, type='blq', scale='nominal', datatype='int32')
elif colname == 'LLOQ':
info = ColumnInfo.create(colname, type='lloq', scale='ratio')
elif colname == 'DVID':
info = ColumnInfo.create(colname, type='dvid', scale='nominal', datatype='int32')
elif colname == 'SS':
info = ColumnInfo.create(colname, type='ss', scale='nominal', datatype='int32')
elif colname == 'II':
info = ColumnInfo.create(colname, type='ii', scale='ratio')
else:
info = ColumnInfo.create(colname)
column_info.append(info)
di = DataInfo.create(column_info, path=path, separator=separator)
return di
def deidentify_data(
df: pd.DataFrame, id_column: str = 'ID', date_columns: Optional[list[str]] = None
):
"""Deidentify a dataset
Two operations are performed on the dataset:
1. All ID numbers are randomized from the range 1 to n
2. All columns containing dates will have the year changed
The year change is done by letting the earliest year in the dataset
be used as a reference and by maintaining leap years. The reference year
    will either be 1901, 1902, 1903 or 1904 depending on its distance to the closest
    preceding leap year.
Parameters
----------
df : pd.DataFrame
A dataset
id_column : str
Name of the id column
date_columns : list
Names of all date columns
Returns
-------
pd.DataFrame
Deidentified dataset
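    Example
    -------
    An illustrative sketch with a small hypothetical dataframe (output not
    verified):
    >>> import pandas as pd
    >>> df = pd.DataFrame({'ID': [1, 1, 2], 'DATE': ['2012-01-01'] * 3, 'DV': [0.1, 0.2, 0.3]})
    >>> deidentify_data(df, id_column='ID', date_columns=['DATE'])  # doctest: +SKIP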
"""
df = df.copy()
df[id_column] = pd.to_numeric(df[id_column])
resampler = resample_data(df, id_column)
df, _ = next(resampler)
if date_columns is None:
return df
for datecol in date_columns:
if pd.api.types.is_datetime64_any_dtype(df[datecol]):
pass
elif df[datecol].dtype == 'object':
# assume string
df[datecol] = pd.to_datetime(df[datecol])
else:
raise ValueError(f"Column {datecol} does not seem to contain a date")
earliest_date = df[date_columns].min().min()
# Handle leap year modulo
earliest_year_modulo = earliest_date.year % 4
reference_offset = 4 if earliest_year_modulo == 0 else earliest_year_modulo
reference_year = 1900 + reference_offset
delta = earliest_date.year - reference_year
def convert(x):
new = x.replace(year=x.year - delta)
return new
for datecol in date_columns:
df[datecol] = df[datecol].transform(convert)
return df
def unload_dataset(model: Model):
"""Unload the dataset from a model
Parameters
----------
model : Model
Pharmpy model
Returns
-------
Model
Pharmpy model with dataset removed
Example
-------
>>> from pharmpy.modeling import load_example_model, unload_dataset
>>> model = load_example_model("pheno")
>>> model = unload_dataset(model)
>>> model.dataset is None
True
"""
model = model.replace(dataset=None)
return model
def load_dataset(model: Model):
"""Load the dataset given datainfo
Parameters
----------
model : Model
Pharmpy model
Returns
-------
Model
        Pharmpy model with dataset loaded
Example
-------
>>> from pharmpy.modeling import load_example_model, load_dataset, unload_dataset
>>> model = load_example_model("pheno")
>>> model = unload_dataset(model)
>>> model.dataset is None
True
>>> model = load_dataset(model)
>>> model.dataset
ID TIME AMT WGT APGR DV FA1 FA2
0 1 0.0 25.0 1.4 7.0 0.0 1.0 1.0
1 1 2.0 0.0 1.4 7.0 17.3 0.0 0.0
2 1 12.5 3.5 1.4 7.0 0.0 1.0 1.0
3 1 24.5 3.5 1.4 7.0 0.0 1.0 1.0
4 1 37.0 3.5 1.4 7.0 0.0 1.0 1.0
.. .. ... ... ... ... ... ... ...
739 59 108.3 3.0 1.1 6.0 0.0 1.0 1.0
740 59 120.5 3.0 1.1 6.0 0.0 1.0 1.0
741 59 132.3 3.0 1.1 6.0 0.0 1.0 1.0
742 59 144.8 3.0 1.1 6.0 0.0 1.0 1.0
743 59 146.8 0.0 1.1 6.0 40.2 0.0 0.0
<BLANKLINE>
[744 rows x 8 columns]
"""
df = read_dataset_from_datainfo(model.datainfo)
model = model.replace(dataset=df)
return model
def set_dataset(
model: Model, path_or_df: Union[str, Path, pd.DataFrame], datatype: Optional[str] = None
):
"""Load the dataset given datainfo
Parameters
----------
model : Model
Pharmpy model
path_or_df : str, Path, or pd.DataFrame
Dataset path or dataframe
datatype : str
Type of dataset (optional)
Returns
-------
Model
Pharmpy model with new dataset and updated datainfo
Example
-------
>>> from pharmpy.modeling import load_example_model, load_dataset, unload_dataset, set_dataset
>>> model = load_example_model("pheno")
>>> model = unload_dataset(model)
>>> dataset_path = model.datainfo.path
>>> model.dataset is None
True
>>> model = set_dataset(model, dataset_path, datatype='nonmem')
>>> model.dataset
ID TIME AMT WGT APGR DV FA1 FA2
0 1 0.0 25.0 1.4 7.0 0.0 1.0 1.0
1 1 2.0 0.0 1.4 7.0 17.3 0.0 0.0
2 1 12.5 3.5 1.4 7.0 0.0 1.0 1.0
3 1 24.5 3.5 1.4 7.0 0.0 1.0 1.0
4 1 37.0 3.5 1.4 7.0 0.0 1.0 1.0
.. .. ... ... ... ... ... ... ...
739 59 108.3 3.0 1.1 6.0 0.0 1.0 1.0
740 59 120.5 3.0 1.1 6.0 0.0 1.0 1.0
741 59 132.3 3.0 1.1 6.0 0.0 1.0 1.0
742 59 144.8 3.0 1.1 6.0 0.0 1.0 1.0
743 59 146.8 0.0 1.1 6.0 40.2 0.0 0.0
<BLANKLINE>
[744 rows x 8 columns]
"""
    if isinstance(path_or_df, pd.DataFrame):
        df = path_or_df
        path = None  # No file path is available when a DataFrame is given
        if datatype == 'nonmem':
            di = create_default_datainfo(path_or_df)
        else:
            di = DataInfo.create(columns=list(df.columns.values), path=None)
else:
path = normalize_user_given_path(path_or_df)
if datatype == 'nonmem':
di = create_default_datainfo(path)
else:
di = DataInfo.create(path=path)
df = read_dataset_from_datainfo(di, datatype=datatype)
di = update_datainfo(di, df).replace(path=path)
if len(df.columns) == 1:
warnings.warn('Could only find one column, should this be another datatype?')
model = model.replace(dataset=df, datainfo=di)
return model.update_source()
def bin_observations(
model: Model, method: Literal["equal_width", "equal_number"], nbins: int
) -> tuple[pd.Series, np.ndarray]:
"""Bin all observations on the independent variable
Available binning methods:
+---------------+-------------------------------------------------+
| Method | Description |
+===============+=================================================+
| equal_width | Bins with equal width based on the idv |
+---------------+-------------------------------------------------+
| equal_number | Bins containing an equal number of observations |
+---------------+-------------------------------------------------+
Parameters
----------
model : Model
Pharmpy model
method : str
Name of the binning method to use
nbins : int
The number of bins wanted
Returns
-------
pd.Series
A series of bin ids indexed on the original record index of the dataset
    np.ndarray
        An array of bin edges
Example
-------
>>> from pharmpy.modeling import load_example_model, bin_observations
>>> model = load_example_model("pheno")
>>> bins, boundaries = bin_observations(model, method="equal_width", nbins=10)
>>> bins
1 0
11 2
13 0
19 1
26 3
..
719 1
727 3
729 0
736 1
743 3
Name: TIME, Length: 155, dtype: int64
>>> boundaries
array([ 0. , 39.88, 78.76, 117.64, 156.52, 195.4 , 234.28, 273.16,
312.04, 350.92, 389.8 ])
"""
df = get_and_check_dataset(model)
observations = get_observations(model, keep_index=True)
obs = df.loc[observations.index]
idv = model.datainfo.idv_column.name
sorted_idvs = obs[idv].sort_values()
method_lower = method.lower()
if method_lower == "equal_width":
bincol, boundaries = pd.cut(sorted_idvs, nbins, labels=False, retbins=True)
boundaries[0] = 0
elif method_lower == "equal_number":
bin_edges = _get_bin_edges_psn(sorted_idvs, nbins)
bincol, boundaries = pd.cut(
sorted_idvs, bin_edges, labels=False, retbins=True, include_lowest=True
)
else:
raise ValueError(f"Unknown binning method {method}")
sorted_bincol = bincol.sort_index()
return sorted_bincol, boundaries
def _get_bin_edges_psn(data, n_bins):
"""Similar to function "get_bin_ceilings_from_count" from PsN
Divide a list of data points into bins of equal count.
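    Example (illustrative, not verified):
    >>> _get_bin_edges_psn([1, 1, 2, 3, 4, 4], 2)  # doctest: +SKIP
    [0, 2, 4]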
"""
# Create a dictionary with unique values and their indices
unique_values, value_indices = np.unique(data, return_inverse=True)
value_dict = {i: val for i, val in enumerate(unique_values)}
# Count occurrences of each unique value
obs_count = pd.Series(value_indices).value_counts().sort_index().tolist()
# Calculate the number of unique values
n_values = len(value_dict)
# Calculate the ideal count for each bin
count_per_bin = len(data) / n_bins
ideal_count = [count_per_bin] * n_bins
bin_ceilings = [0]
global_error = 0
bin_index = 0
local_error = -ideal_count[bin_index]
for value_index, obs in enumerate(obs_count):
if bin_index == len(ideal_count) - 1:
bin_ceilings.append(unique_values[n_values - 1])
break
elif local_error == -ideal_count[bin_index]:
local_error += obs
elif obs == 0:
continue
elif abs(global_error + local_error) > abs(global_error + local_error + obs) and (
n_values - value_index - 1
) > (len(ideal_count) - bin_index - 1):
local_error += obs
else:
bin_ceilings.append(unique_values[value_index - 1])
global_error += local_error
bin_index += 1
local_error = -ideal_count[bin_index] + obs
# FIXME: This is a trick to not have 0 and 0
if len(bin_ceilings) > 1 and bin_ceilings[0] == bin_ceilings[1]:
bin_ceilings[1] += 0.00000001
return bin_ceilings