"""DataInfo is a companion to the dataset. It contains metadata of the dataset"""
from __future__ import annotations
import json
from collections.abc import Mapping, Sequence
from pathlib import Path
from typing import Any, Optional, Union, cast, overload
from pharmpy import conf
from pharmpy.basic import Expr, Unit
from pharmpy.deps import pandas as pd
from pharmpy.internals.fs.path import path_absolute, path_relative_to
from pharmpy.internals.immutable import Immutable, frozenmapping
[docs]
class DataVariable(Immutable):
    """Metadata for one variable represented by data

    A column of a long format dataset can hold several data variables.

    Parameters
    ----------
    name : str
        Variable name. Not the same as the name of the column
    type : str
        Type of variable (see the "type" attribute)
    scale : str
        Scale of measurement (see the "scale" attribute)
    count : bool
        True if count data or False otherwise
    properties : dict
        Other properties of the variable (see the "properties" attribute)
    """

    # Accepted values for the "type" attribute
    _all_types = {
        'id', 'dv', 'dvid', 'idv', 'unknown', 'dose', 'rate', 'additional', 'ii',
        'ss', 'event', 'covariate', 'mdv', 'compartment', 'admid', 'lloq', 'blq',
    }

    # Accepted scales of measurement; the tuple is interpolated into an error message
    _all_scales = ('nominal', 'ordinal', 'interval', 'ratio')

    # Accepted values for the "descriptor" property (None is accepted as well)
    _all_descriptors = {
        None,
        'age',
        'body height',
        'body weight',
        'body surface area',
        'lean body mass',
        'fat free mass',
        'time after dose',
        'plasma concentration',
        'subject identifier',
        'observation identifier',
        'pk measurement',
        'pd measurement',
    }

    # Accepted property names
    _all_properties = {'unit', 'categories', 'descriptor', 'molar_mass'}
def __init__(
    self,
    name: str,
    type: str = 'unknown',
    scale: str = 'ratio',
    count: bool = False,
    properties: Mapping[str, Any] = frozenmapping({}),
):
    """Store the given attributes as they are, without any validation"""
    self._name, self._type, self._scale = name, type, scale
    self._count, self._properties = count, properties
@staticmethod
def _canonicalize_properties(properties: Mapping[str, Any]) -> Mapping[str, Any]:
    """Validate property names and convert property values to canonical form

    Parameters
    ----------
    properties : Mapping[str, Any]
        Properties to canonicalize

    Returns
    -------
    Mapping[str, Any]
        A frozenmapping where 'categories' is a tuple, 'unit' is a Unit and
        'molar_mass' is a float

    Raises
    ------
    ValueError
        For an unknown property name or an unknown descriptor
    """
    # Properties that are converted by simply calling a constructor on the value
    converters = {'categories': tuple, 'unit': Unit, 'molar_mass': float}
    canonical = dict(properties)
    for key, value in properties.items():
        if key in converters:
            canonical[key] = converters[key](value)
        elif key == 'descriptor':
            # Descriptors are only validated, the value is kept as it is
            if value not in DataVariable._all_descriptors:
                raise ValueError(f"unknown descriptor {value}")
        else:
            raise ValueError(f'Unknown DataVariable property "{key}"')
    return frozenmapping(canonical)
[docs]
@classmethod
def create(
    cls,
    name: str,
    type: str = 'unknown',
    scale: str = 'ratio',
    count: bool = False,
    properties: Mapping[str, Any] = frozenmapping({}),
) -> DataVariable:
    """Validate the arguments and create a new DataVariable

    Raises
    ------
    TypeError
        If name is not a string
    ValueError
        If type or scale is unknown, if count data is combined with a nominal
        or ordinal scale or if the properties cannot be canonicalized
    """
    if not isinstance(name, str):
        raise TypeError("Data variable name must be a string")
    if type not in DataVariable._all_types:
        raise ValueError(f"Unknown column type {type}")
    if scale not in DataVariable._all_scales:
        raise ValueError(
            f"Unknown scale of measurement {scale}. Only {DataVariable._all_scales} are possible."
        )

    is_count = bool(count)
    categorical_scale = scale in {'nominal', 'ordinal'}
    if is_count and categorical_scale:
        raise ValueError("A nominal or ordinal data variable cannot be count data")

    canonical = DataVariable._canonicalize_properties(properties)
    return cls(name=name, type=type, scale=scale, count=is_count, properties=canonical)
[docs]
def replace(self, **kwargs) -> DataVariable:
    """Create a new DataVariable where the given attributes have been replaced"""
    # Instance attributes are stored with a leading underscore; strip it to
    # get the keyword argument names of create
    current = {attr[1:]: value for attr, value in self.__dict__.items()}
    return DataVariable.create(**{**current, **kwargs})
def __eq__(self, other: Any):
    """Two data variables are equal if all their attributes are equal"""
    if self is other:
        return True
    if not isinstance(other, DataVariable):
        return NotImplemented
    attrs = ('_name', '_type', '_scale', '_count', '_properties')
    return all(getattr(self, attr) == getattr(other, attr) for attr in attrs)
def __hash__(self):
    """Hash over the same attributes that are compared in __eq__"""
    key = (self._name, self._type, self._scale, self._count, self._properties)
    return hash(key)
[docs]
def to_dict(self) -> dict[str, Any]:
    """Convert to a dictionary where the unit property, if any, is serialized"""
    serialized = {
        key: (value.serialize() if key == 'unit' else value)
        for key, value in self._properties.items()
    }
    return {
        'name': self._name,
        'type': self._type,
        'scale': self._scale,
        'count': self._count,
        'properties': serialized,
    }
[docs]
@classmethod
def from_dict(cls, d: dict[str, Any]) -> DataVariable:
    """Create a DataVariable from a dictionary

    Parameters
    ----------
    d : dict
        Dictionary with the mandatory key 'name' and the optional keys 'type',
        'scale', 'count' and 'properties'. The dictionary is left unmodified.

    Returns
    -------
    DataVariable
        Created through ``create``, i.e. all values are validated
    """
    # Work on a copy: the caller's properties dict must not be modified, else
    # a second call with the same dict would see an already deserialized unit
    properties = dict(d.get('properties', {}))
    if 'unit' in properties:
        properties['unit'] = Unit.deserialize(properties['unit'])
    if 'categories' in properties:
        properties['categories'] = tuple(properties['categories'])
    return cls.create(
        name=d['name'],
        type=d.get('type', 'unknown'),
        scale=d.get('scale', 'ratio'),
        count=d.get('count', False),
        properties=properties,
    )
@property
def name(self) -> str:
    """Name of the data variable"""
    return self._name
@property
def symbol(self) -> Expr:
    """Symbol with the same name as the variable"""
    return Expr.symbol(self._name)
@property
def type(self) -> str:
    """Type of the data variable

    ============  =============
    type          Description
    ============  =============
    id            Individual identifier. Max one per DataFrame. All values have to be unique
    idv           Independent variable. Max one per DataFrame.
    dv            Observations of the dependent variable
    dvid          Dependent variable ID
    covariate     Covariate
    dose          Dose amount
    rate          Rate of infusion
    additional    Number of additional doses
    ii            Interdose interval
    ss            Steady state dosing
    event         0 = observation
    mdv           0 = DV is observation value, 1 = DV is missing
    admid         Administration ID
    compartment   Compartment information (not yet exactly specified)
    lloq          Lower limit of quantification
    blq           Below limit of quantification indicator
    unknown       Unknown type. This is the default for variables that have not
                  been assigned a type
    ============  =============
    """
    return self._type
@property
def scale(self) -> str:
    """Scale of measurement

    The statistical scale of measurement of the data variable. One of
    'nominal', 'ordinal', 'interval' and 'ratio'.
    """
    return self._scale
@property
def count(self) -> bool:
    """True if the data variable represents count data"""
    return self._count
@property
def properties(self) -> Mapping[str, Any]:
    """Additional properties of the DataVariable

    descriptor
        Kind of data

        ====================== ============================================
        descriptor             Description
        ====================== ============================================
        age                    Age (since birth)
        body height            Human body height
        body surface area      Body surface area (calculated)
        body weight            Human body weight
        lean body mass         Lean body mass
        fat free mass          Fat free mass
        time after dose        Time after dose
        plasma concentration   Concentration of substance in blood plasma
        subject identifier     Unique integer identifier for a subject
        observation identifier Unique integer identifier for an observation
        pk measurement         Any kind of PK measurement
        pd measurement         Any kind of PD measurement
        ====================== ============================================

    unit
        Unit of the data variable

        Custom units are allowed, but units that are available in
        sympy.physics.units can be recognized.

    categories
        All possible values of categorical data

    molar_mass
        The molar mass of a substance in g/mol
    """
    return self._properties
[docs]
def is_categorical(self) -> bool:
    """Check if the data variable is categorical

    Returns
    -------
    bool
        True if the scale is nominal or ordinal and False otherwise.

    See also
    --------
    is_numerical : Check if the data variable is numerical

    Examples
    --------
    >>> from pharmpy.model import DataVariable
    >>> var1 = DataVariable.create("WGT", scale='ratio')
    >>> var1.is_categorical()
    False
    >>> var2 = DataVariable.create("ID", scale='nominal')
    >>> var2.is_categorical()
    True
    """
    categorical_scales = {'nominal', 'ordinal'}
    return self.scale in categorical_scales
[docs]
def is_numerical(self) -> bool:
    """Check if the data variable is numerical

    Returns
    -------
    bool
        True if the scale is interval or ratio and False otherwise.

    See also
    --------
    is_categorical : Check if the data variable is categorical

    Examples
    --------
    >>> from pharmpy.model import DataVariable
    >>> var1 = DataVariable.create("WGT", scale='ratio')
    >>> var1.is_numerical()
    True
    >>> var2 = DataVariable.create("ID", scale='nominal')
    >>> var2.is_numerical()
    False
    """
    numerical_scales = {'interval', 'ratio'}
    return self.scale in numerical_scales
[docs]
def get_property(self, property: str) -> Any:
    """Get the value of a property or its default if it is not defined

    Parameters
    ----------
    property : str
        The property to get

    Returns
    -------
    Any
        The value of the property or its default value

    Raises
    ------
    ValueError
        If the property name is unknown
    KeyError
        If the property has no value and no default value

    Examples
    --------
    >>> from pharmpy.model import DataVariable
    >>> var1 = DataVariable.create("WGT", properties={"unit": "kg"})
    >>> var1.get_property("unit")
    kilogram
    >>> var2 = DataVariable.create("ID")
    >>> var2.get_property("unit")
    1
    """
    if property not in DataVariable._all_properties:
        raise ValueError(f"Unknown property {property}")

    # Only the unit has a default value
    fallback = Expr.integer(1) if property == 'unit' else None
    result = self.properties.get(property, fallback)
    if result is None:
        raise KeyError(f"No value and no default value for property {property}")
    return result
[docs]
def set_property(self, property: str, value: Any) -> DataVariable:
    """Set the value of a property

    Parameters
    ----------
    property : str
        The property to set
    value : Any
        Value for the property

    Returns
    -------
    DataVariable
        The updated DataVariable

    Examples
    --------
    >>> from pharmpy.model import DataVariable
    >>> var1 = DataVariable.create("WGT")
    >>> var2 = var1.set_property("unit", "kg")
    >>> var2.get_property("unit")
    kilogram
    """
    updated = {**self._properties, property: value}
    return self.replace(properties=updated)
[docs]
def remove_property(self, property: str) -> DataVariable:
    """Remove a property

    Removing a property that is not set is not an error; the returned
    DataVariable then has the same properties as before.

    Parameters
    ----------
    property : str
        Name of the property to remove (e.g. "descriptor"), not its value

    Returns
    -------
    DataVariable
        The updated DataVariable

    Examples
    --------
    >>> from pharmpy.model import DataVariable
    >>> var1 = DataVariable.create("WGT", properties={"descriptor": "body weight"})
    >>> var2 = var1.remove_property("descriptor")
    """
    remaining = {key: value for key, value in self._properties.items() if key != property}
    return self.replace(properties=remaining)
def __repr__(self):
    """Single line representation listing all attributes"""
    head = f"DataVariable(name={self._name}, type={self._type}, scale={self._scale}, "
    tail = f"count={self._count}, properties={self._properties})"
    return head + tail
[docs]
class ColumnInfo(Immutable):
    """Metadata for one column of the dataset

    Parameters
    ----------
    name : str
        Column name
    variable_mapping : Mapping[int, DataVariable]
        A single DataVariable or a Mapping from identifier column to the DataVariable
    variable_id : str
        The DataVariable identifier column
    drop : bool
        Should column be dropped (i.e. barred from being used)
    datatype : str
        Pandas datatype or special Pharmpy datatype (see the "datatype" attribute)
    """

    # Accepted datatypes; the tuple is interpolated into an error message
    _all_dtypes = (
        'int8', 'int16', 'int32', 'int64',
        'uint8', 'uint16', 'uint32', 'uint64',
        'float16', 'float32', 'float64', 'float128',
        'nmtran-time', 'nmtran-date',
        'str',
    )
[docs]
@staticmethod
def convert_pd_dtype_to_datatype(dtype) -> str:
    """Convert pandas dtype to Pharmpy datatype

    Parameters
    ----------
    dtype : str
        String representing a pandas dtype

    Returns
    -------
    str
        String representing a Pharmpy datatype. Anything that is not a valid
        Pharmpy datatype is converted to 'str'.

    Examples
    --------
    >>> from pharmpy.model import ColumnInfo
    >>> ColumnInfo.convert_pd_dtype_to_datatype("float64")
    'float64'
    """
    if dtype in ColumnInfo._all_dtypes:
        return dtype
    return 'str'
[docs]
@staticmethod
def convert_datatype_to_pd_dtype(datatype) -> str:
    """Convert Pharmpy datatype to pandas dtype

    Parameters
    ----------
    datatype : str
        String representing a Pharmpy datatype

    Returns
    -------
    str
        String representing a pandas dtype. Datatypes starting with "nmtran"
        are converted to 'str'.

    Examples
    --------
    >>> from pharmpy.model import ColumnInfo
    >>> ColumnInfo.convert_datatype_to_pd_dtype("float64")
    'float64'
    >>> ColumnInfo.convert_datatype_to_pd_dtype("nmtran-date")
    'str'
    """
    return 'str' if datatype.startswith('nmtran') else datatype
def __init__(
    self,
    name: str,
    variable_mapping: Union[Mapping[int, DataVariable], DataVariable],
    variable_id: Optional[str] = None,
    drop: bool = False,
    datatype: str = "float64",
):
    """Store the given attributes as they are, without any validation"""
    self._name, self._variable_mapping = name, variable_mapping
    self._variable_id, self._drop, self._datatype = variable_id, drop, datatype
[docs]
@classmethod
def create(
    cls,
    name: str,
    variable_mapping: Optional[Union[Mapping[int, DataVariable], DataVariable]] = None,
    variable_id: Optional[str] = None,
    drop: bool = False,
    datatype: str = "float64",
) -> ColumnInfo:
    """Validate the arguments and create a new ColumnInfo

    Parameters
    ----------
    name : str
        Column name
    variable_mapping : DataVariable, Mapping[int, DataVariable] or None
        A single DataVariable or a mapping from values of the identifier
        column to DataVariables. If None a DataVariable with default
        attributes named as the column is created.
    variable_id : str or None
        Name of the identifier column. Needed when a mapping is given.
    drop : bool
        Should column be dropped
    datatype : str
        Datatype of the column

    Raises
    ------
    TypeError
        If name is not a string, if variable_id is neither a string nor None
        or if variable_mapping is neither a DataVariable nor a mapping from
        int to DataVariable
    ValueError
        If the datatype is unknown, if the variables of a mapping do not all
        have the same type or if variable_id is missing for a mapping
    """
    # Check the simple arguments first so that nothing is built from bad input
    if not isinstance(name, str):
        raise TypeError("Column name must be a string")
    if datatype not in ColumnInfo._all_dtypes:
        raise ValueError(
            f"{datatype} is not a valid datatype. Valid datatypes are {ColumnInfo._all_dtypes}"
        )
    if variable_id is not None and not isinstance(variable_id, str):
        raise TypeError("variable_id must be a string or None")

    if variable_mapping is None:
        variable_mapping = DataVariable.create(name)

    if not isinstance(variable_mapping, DataVariable):
        # The shape of the mapping must be verified before its values are used
        if not isinstance(variable_mapping, Mapping) or not all(
            isinstance(key, int) and isinstance(value, DataVariable)
            for key, value in variable_mapping.items()
        ):
            raise TypeError(
                "The variable_mapping must be either a single DataVariable"
                " or a mapping from int to DataVariable"
            )
        types = {var.type for var in variable_mapping.values()}
        if len(types) != 1:
            raise ValueError("All data variables need to have the same type in a column")
        if variable_id is None:
            raise ValueError("Need a variable_id when mapping to multiple variables")
        variable_mapping = frozenmapping(variable_mapping)

    return cls(
        name=name,
        datatype=datatype,
        drop=drop,
        variable_id=variable_id,
        variable_mapping=variable_mapping,
    )
[docs]
def replace(self, **kwargs) -> ColumnInfo:
    """Create a new ColumnInfo where the given attributes have been replaced"""
    # Instance attributes are stored with a leading underscore; strip it to
    # get the keyword argument names of create
    current = {attr[1:]: value for attr, value in self.__dict__.items()}
    return ColumnInfo.create(**{**current, **kwargs})
def __eq__(self, other: Any):
    """Two columns are equal if all their attributes are equal"""
    if self is other:
        return True
    if not isinstance(other, ColumnInfo):
        return NotImplemented
    attrs = ('_name', '_drop', '_datatype', '_variable_id', '_variable_mapping')
    return all(getattr(self, attr) == getattr(other, attr) for attr in attrs)
def __hash__(self):
    """Hash over the same attributes that are compared in __eq__"""
    key = (self._name, self._drop, self._datatype, self._variable_id, self._variable_mapping)
    return hash(key)
[docs]
def to_dict(self) -> dict[str, Any]:
    """Convert to a dictionary

    A single DataVariable is stored directly under 'variable_mapping' while
    a mapping is stored as a dictionary with the keys converted to strings.
    """
    mapping = self._variable_mapping
    if isinstance(mapping, DataVariable):
        serialized = mapping.to_dict()
    else:
        serialized = {str(key): var.to_dict() for key, var in mapping.items()}
    return {
        'name': self._name,
        'drop': self._drop,
        'datatype': self._datatype,
        'variable_id': self._variable_id,
        'variable_mapping': serialized,
    }
[docs]
@classmethod
def from_dict(cls, d: dict[str, Any]) -> ColumnInfo:
    """Create a ColumnInfo from a dictionary

    Without a 'variable_id' the 'variable_mapping' entry is read as a single
    DataVariable, otherwise as a mapping where the keys are converted to int.
    """
    variable_id = d.get('variable_id', None)
    raw_mapping = d['variable_mapping']
    if variable_id is None:
        mapping = DataVariable.from_dict(raw_mapping)
    else:
        variables = {int(key): DataVariable.from_dict(var) for key, var in raw_mapping.items()}
        mapping = frozenmapping(variables)
    return cls.create(
        name=d['name'],
        drop=d.get('drop', False),
        datatype=d.get('datatype', 'float64'),
        variable_id=variable_id,
        variable_mapping=mapping,
    )
@property
def name(self) -> str:
    """Name of the column"""
    return self._name
@property
def symbol(self) -> Expr:
    """Symbol with the same name as the column"""
    return Expr.symbol(self._name)
@property
def drop(self) -> bool:
    """True if the column should be dropped"""
    return self._drop
@property
def datatype(self) -> str:
    """Datatype of the column

    ============ ================ ======== ================================= ===========
    datatype     Description      Size     Range                             NA allowed?
    ============ ================ ======== ================================= ===========
    int8         Signed integer   8 bits   -128 to +127.                     No
    int16        Signed integer   16 bits  -32,768 to +32,767.               No
    int32        Signed integer   32 bits  -2,147,483,648 to +2,147,483,647. No
    int64        Signed integer   64 bits  -9,223,372,036,854,775,808 to     No
                                           9,223,372,036,854,775,807.
    uint8        Unsigned integer 8 bits   0 to 255.                         No
    uint16       Unsigned integer 16 bits  0 to 65,535.                      No
    uint32       Unsigned integer 32 bits  0 to 4,294,967,295.               No
    uint64       Unsigned integer 64 bits  0 to 18,446,744,073,709,551,615   No
    float16      Binary float     16 bits  ≈ ±6.55×10⁴                       Yes
    float32      Binary float     32 bits  ≈ ±3.4×10³⁸                       Yes
    float64      Binary float     64 bits  ≈ ±1.8×10³⁰⁸                      Yes
    float128     Binary float     128 bits ≈ ±1.2×10⁴⁹³²                     Yes
    nmtran-time  NM-TRAN time     n                                          No
    nmtran-date  NM-TRAN date     n                                          No
    str          General string   n                                          No
    ============ ================ ======== ================================= ===========

    The default, and most common datatype, is float64.
    """
    return self._datatype
@property
def variable_id(self) -> Optional[str]:
    """Name of the identifier column (e.g. DVID or ADMID) or None"""
    return self._variable_id
@property
def variable_mapping(self) -> Union[Mapping[int, DataVariable], DataVariable]:
    """A single DataVariable or a mapping from value in the identifier column to DataVariable"""
    return self._variable_mapping
@property
def variable(self) -> DataVariable:
    """The DataVariable of a column representing a single DataVariable

    Raises
    ------
    ValueError
        If the column represents more than one DataVariable
    """
    if isinstance(self._variable_mapping, DataVariable):
        return self._variable_mapping
    raise ValueError("This ColumnInfo represents more than one DataVariable. Use indexing")
@property
def variables(self) -> tuple[DataVariable, ...]:
    """Tuple of all data variables defined in this column"""
    mapping = self._variable_mapping
    if isinstance(mapping, DataVariable):
        return (mapping,)
    return tuple(mapping.values())
@property
def type(self) -> str:
    """The type of the column. See DataVariable.type

    All variables in one column must have the same type, so for a mapping
    the type of its first variable is returned.
    """
    mapping = self._variable_mapping
    if isinstance(mapping, DataVariable):
        return mapping.type
    first_variable = next(iter(mapping.values()))
    return first_variable.type
def __len__(self) -> int:
    """Number of data variables defined in this column"""
    mapping = self._variable_mapping
    return 1 if isinstance(mapping, DataVariable) else len(mapping)
def __getitem__(self, index) -> DataVariable:
    """Get a DataVariable of a column representing multiple variables

    An int index is looked up as a key of the variable mapping and any other
    index is compared with the names of the variables.

    Raises
    ------
    KeyError
        If the column represents a single DataVariable, if an int index is
        not a key of the mapping or if no variable has the given name
    """
    mapping = self._variable_mapping
    if isinstance(mapping, DataVariable):
        raise KeyError("This ColumnInfo represents a single DataVariable. Use .variable")
    if isinstance(index, int):
        return mapping[index]
    for candidate in mapping.values():
        if candidate.name == index:
            return candidate
    raise KeyError(f"No DataVariable named {index}")
[docs]
def is_integer(self) -> bool:
    """Check if the column datatype is integral

    Returns
    -------
    bool
        True if the datatype is one of the signed or unsigned integer datatypes

    See also
    --------
    is_categorical : Check if the column data is categorical

    Examples
    --------
    >>> from pharmpy.model import ColumnInfo, DataVariable
    >>> var = DataVariable.create("WGT", scale='ratio')
    >>> col = ColumnInfo.create("WGT", var)
    >>> col.is_integer()
    False
    """
    signed = ('int8', 'int16', 'int32', 'int64')
    unsigned = ('uint8', 'uint16', 'uint32', 'uint64')
    return self.datatype in signed + unsigned
def __repr__(self):
    """Representation as a pandas Series printed with the column name"""
    joined_names = ', '.join([var.name for var in self.variables])
    labels = ['drop', 'datatype', 'variable_id', 'variables']
    values = [self._drop, self._datatype, self._variable_id, joined_names]
    ser = pd.Series(values, index=labels, name=self._name)
    return ser.to_string(name=True)
[docs]
class DataInfo(Sequence, Immutable):
    """Metadata for the dataset

    A DataInfo is a sequence of ColumnInfo and can be indexed to get the
    ColumnInfo of a column.

    Parameters
    ----------
    columns : tuple
        Tuple of ColumnInfo
    path : Path
        Path to dataset file
    separator : str
        Character or regexp separator for dataset
    missing_data_token : str
        Token for missing data
    """
def __init__(
    self,
    columns: tuple[ColumnInfo, ...] = (),
    path: Optional[Path] = None,
    separator: str = ',',
    missing_data_token: Optional[str] = None,
):
    """Store the given attributes without validation

    If no missing_data_token is given the token from the configuration is used.
    """
    self._columns = columns
    self._path = path
    self._separator = separator
    self._missing_data_token = (
        conf.missing_data_token if missing_data_token is None else missing_data_token
    )
[docs]
@classmethod
def create(
    cls,
    columns: Optional[Union[Sequence[ColumnInfo], Sequence[str]]] = None,
    path: Optional[Union[str, Path]] = None,
    separator: str = ',',
    missing_data_token: Optional[str] = None,
) -> DataInfo:
    """Validate the arguments and create a new DataInfo

    Parameters
    ----------
    columns : Sequence of ColumnInfo or str
        The columns. For a str a ColumnInfo with a single DataVariable with
        default attributes is created, both named by the string.
    path : Path or str
        Path to dataset file
    separator : str
        Character or regexp separator for dataset
    missing_data_token : str
        Token for missing data. The token from the configuration is used
        if None.

    Raises
    ------
    TypeError
        If columns is not a sequence of str or ColumnInfo
    ValueError
        If the column names are not unique or if a variable_id of a column
        is not the name of a column
    """
    if columns:
        if not isinstance(columns, Sequence):
            raise TypeError('Argument `columns` must be iterable')
        if any(not isinstance(col, (str, ColumnInfo)) for col in columns):
            raise TypeError(
                'Argument `columns` need to consist of either type `str` or `ColumnInfo`'
            )

    if columns is None or len(columns) == 0:
        cols = ()
    else:
        # Names are expanded into default columns, ColumnInfo objects are kept
        cols = tuple(
            ColumnInfo.create(col, DataVariable.create(col)) if isinstance(col, str) else col
            for col in columns
        )

    if path is not None:
        path = Path(path)
    if missing_data_token is None:
        missing_data_token = conf.missing_data_token

    colnames = [col.name for col in cols]
    unique_names = set(colnames)
    if len(unique_names) != len(colnames):
        raise ValueError("Column names in a DataInfo need to be unique")

    referenced_ids = {col.variable_id for col in cols if col.variable_id is not None}
    missing_ids = referenced_ids - unique_names
    if missing_ids:
        raise ValueError(f"All variable_ids must exist as columns. Missing: {missing_ids}")

    return cls(
        columns=cols, path=path, separator=separator, missing_data_token=str(missing_data_token)
    )
[docs]
def replace(self, **kwargs) -> DataInfo:
    """Create a new DataInfo where the given attributes have been replaced

    The attributes 'columns', 'path', 'separator' and 'missing_data_token'
    are recognized. Other keyword arguments are ignored.
    """
    columns = tuple(kwargs['columns']) if 'columns' in kwargs else self._columns

    if 'path' not in kwargs:
        path = self._path
    elif kwargs['path'] is None:
        path = None
    else:
        path = Path(kwargs['path'])

    return DataInfo.create(
        columns=columns,
        path=path,
        separator=kwargs.get('separator', self._separator),
        missing_data_token=str(kwargs.get('missing_data_token', self._missing_data_token)),
    )
def __add__(self, other: Union[DataInfo, ColumnInfo, Sequence[ColumnInfo]]) -> DataInfo:
    """Create a new DataInfo with columns appended after the columns of this one

    Parameters
    ----------
    other : DataInfo, ColumnInfo or sequence of ColumnInfo
        The columns to append

    Returns
    -------
    DataInfo
        DataInfo with the path, the separator and the missing data token
        of this (the left hand side) DataInfo
    """
    if isinstance(other, DataInfo):
        appended = other._columns
    elif isinstance(other, ColumnInfo):
        appended = (other,)
    else:
        appended = tuple(other)
    # The token must be passed explicitly, else create resets it to the
    # token of the configuration
    return DataInfo.create(
        columns=self._columns + appended,
        path=self.path,
        separator=self.separator,
        missing_data_token=self.missing_data_token,
    )
def __radd__(self, other: Union[ColumnInfo, Sequence[ColumnInfo]]) -> DataInfo:
    """Create a new DataInfo with columns prepended before the columns of this one

    Parameters
    ----------
    other : ColumnInfo or sequence of ColumnInfo
        The columns to prepend

    Returns
    -------
    DataInfo
        DataInfo with the path, the separator and the missing data token
        of this (the right hand side) DataInfo
    """
    prepended = (other,) if isinstance(other, ColumnInfo) else tuple(other)
    # The token must be passed explicitly, else create resets it to the
    # token of the configuration
    return DataInfo.create(
        columns=prepended + self._columns,
        path=self.path,
        separator=self.separator,
        missing_data_token=self.missing_data_token,
    )
def __eq__(self, other: Any):
    """Two DataInfo are equal if they have pairwise equal columns

    Path, separator and missing data token are not compared.
    """
    if self is other:
        return True
    if not isinstance(other, DataInfo):
        return NotImplemented
    if len(self) != len(other):
        return False
    return not any(col1 != col2 for col1, col2 in zip(self, other))
def __hash__(self):
    """Hash of the columns only, matching what __eq__ compares"""
    columns = self._columns
    return hash(columns)
def __len__(self):
    """Number of columns"""
    columns = self._columns
    return len(columns)
def _getindex(self, i: Any) -> int:
    """Translate a column name or an integer index into an integer index

    An int is returned as it is, without range check.

    Raises
    ------
    IndexError
        If there is no column with the given name
    TypeError
        If the index is neither a str nor an int
    """
    if isinstance(i, int):
        return i
    if not isinstance(i, str):
        raise TypeError(f"Cannot index DataInfo by {type(i)}")
    for position, col in enumerate(self._columns):
        if col.name == i:
            return position
    raise IndexError(f"Cannot find column {i} in DataInfo")
@overload
def __getitem__(self, index: Union[int, str]) -> ColumnInfo: ...


@overload
def __getitem__(self, index: Union[Sequence, slice]) -> DataInfo: ...


def __getitem__(self, index: Union[Sequence, slice, int, str]) -> Union[DataInfo, ColumnInfo]:
    """Get one column or a DataInfo with a selection of the columns

    Parameters
    ----------
    index : int, str, Sequence or slice
        An int or a column name gives a single ColumnInfo. A sequence of
        ints and/or column names gives a new DataInfo created from the
        selected columns only. A slice gives a new DataInfo that also keeps
        the path, the separator and the missing data token.

    Raises
    ------
    IndexError
        If a column name cannot be found
    TypeError
        If the index is of an unsupported type
    """
    # NOTE: str must be checked before Sequence since a str is also a Sequence
    if isinstance(index, (int, str)):
        return self._columns[self._getindex(index)]
    if isinstance(index, Sequence):
        cols = [self._columns[self._getindex(ind)] for ind in index]
        return DataInfo.create(columns=cols)
    if isinstance(index, slice):
        # The token is passed together with path and separator, else create
        # resets it to the token of the configuration
        return DataInfo.create(
            self._columns[index],
            path=self._path,
            separator=self._separator,
            missing_data_token=self._missing_data_token,
        )
    # NOTE: To trigger the exception
    return self._columns[self._getindex(index)]
def __contains__(self, value: Any) -> bool:
    """Check if a column, given as ColumnInfo or as column name, is in the DataInfo"""
    return any(col == value or col.name == value for col in self)
@property
def path(self) -> Optional[Path]:
    r"""Path of the dataset file or None if not set

    Examples
    --------
    >>> from pharmpy.modeling import load_example_model
    >>> model = load_example_model("pheno")
    >>> str(model.datainfo.path).replace('\\', '/')    # doctest: +ELLIPSIS
    '.../pharmpy/internals/example_models/pheno.dta'
    """
    return self._path
@property
def separator(self) -> str:
    """Separator of the dataset file

    Either a single character or a regular expression string.
    """
    return self._separator
@property
def missing_data_token(self) -> str:
    """The token used for missing data"""
    return self._missing_data_token
@property
def typeix(self) -> TypeIndexer:
    """Indexer for getting columns by type

    Example
    -------
    >>> from pharmpy.modeling import load_example_model
    >>> model = load_example_model("pheno")
    >>> model.datainfo.typeix['covariate'].names
    ['WGT', 'APGR']
    """
    indexer = TypeIndexer(self)
    return indexer
[docs]
def set_column(self, col: ColumnInfo) -> DataInfo:
    """Set ColumnInfo of an existing column of the same name

    Columns with other names are kept. If no column has the same name as
    the new ColumnInfo all columns are kept.

    Parameters
    ----------
    col : ColumnInfo
        New ColumnInfo

    Returns
    -------
    DataInfo
        Updated DataInfo
    """
    newcols = [col if cur.name == col.name else cur for cur in self]
    return self.replace(columns=newcols)
@property
def id_column(self) -> ColumnInfo:
    """The first column of type id

    Examples
    --------
    >>> from pharmpy.modeling import load_example_model
    >>> model = load_example_model("pheno")
    >>> model.datainfo.id_column.name
    'ID'
    """
    id_columns = self.typeix['id']
    return id_columns[0]
def _set_column_type(self, name: str, type: str) -> DataInfo:
    """Set the type of all variables of one column, for types allowed for one column only

    Parameters
    ----------
    name : str
        Name of the column
    type : str
        The new type

    Returns
    -------
    DataInfo
        Updated DataInfo with the same path, separator and missing data token

    Raises
    ------
    ValueError
        If another column already has the type
    IndexError
        If there is no column with the given name
    """
    for col in self:
        if col.name != name and col.type == type:
            raise ValueError(
                f"Cannot set new {type} column: column {col.name} already has type {type}"
            )

    for ind, mycol in enumerate(self):
        if mycol.name == name:
            break
    else:
        raise IndexError(f"No column {name} in DataInfo")

    var_mapping = mycol.variable_mapping
    if isinstance(var_mapping, DataVariable):
        new_mapping = var_mapping.replace(type=type)
    else:
        new_mapping = {key: var.replace(type=type) for key, var in var_mapping.items()}
    newcol = mycol.replace(variable_mapping=new_mapping)

    cols = self._columns[:ind] + (newcol,) + self._columns[ind + 1 :]
    # The token must be passed explicitly, else create resets it to the
    # token of the configuration
    return DataInfo.create(
        cols,
        path=self._path,
        separator=self._separator,
        missing_data_token=self._missing_data_token,
    )
[docs]
def set_id_column(self, name: str) -> DataInfo:
    """Set the type of the named column to id and return the updated DataInfo"""
    updated = self._set_column_type(name, 'id')
    return updated
@property
def dv_column(self) -> ColumnInfo:
    """The first column of type dv

    Examples
    --------
    >>> from pharmpy.modeling import load_example_model
    >>> model = load_example_model("pheno")
    >>> model.datainfo.dv_column.name
    'DV'
    """
    dv_columns = self.typeix['dv']
    return dv_columns[0]
[docs]
def set_dv_column(self, name: str) -> DataInfo:
    """Set the type of the named column to dv and return the updated DataInfo"""
    updated = self._set_column_type(name, 'dv')
    return updated
@property
def idv_column(self) -> ColumnInfo:
    """The first column of type idv

    Examples
    --------
    >>> from pharmpy.modeling import load_example_model
    >>> model = load_example_model("pheno")
    >>> model.datainfo.idv_column.name
    'TIME'
    """
    idv_columns = self.typeix['idv']
    return idv_columns[0]
[docs]
def set_idv_column(self, name: str) -> DataInfo:
    """Set the type of the named column to idv and return the updated DataInfo"""
    updated = self._set_column_type(name, 'idv')
    return updated
@property
def names(self) -> list[str]:
    """Names of all columns, in column order

    Examples
    --------
    >>> from pharmpy.modeling import load_example_model
    >>> model = load_example_model("pheno")
    >>> model.datainfo.names
    ['ID', 'TIME', 'AMT', 'WGT', 'APGR', 'DV', 'FA1', 'FA2']
    """
    collected = []
    for column in self._columns:
        collected.append(column.name)
    return collected
@property
def symbols(self) -> list[Expr]:
    """Symbols of all columns, in column order

    Examples
    --------
    >>> from pharmpy.modeling import load_example_model
    >>> model = load_example_model("pheno")
    >>> model.datainfo.symbols
    [ID, TIME, AMT, WGT, APGR, DV, FA1, FA2]
    """
    collected = []
    for column in self._columns:
        collected.append(column.symbol)
    return collected
@property
def types(self) -> list[str]:
    """Types of all columns, in column order"""
    collected = []
    for column in self._columns:
        collected.append(column.type)
    return collected
[docs]
def set_types(self, value: Union[list[str], str]) -> DataInfo:
    """Set types for all columns

    Parameters
    ----------
    value : list or str
        Types to set. If only one this will be broadcast. The given list is
        left unmodified.

    Return
    ------
    DataInfo
        Updated datainfo with the same path, separator and missing data token

    Raises
    ------
    ValueError
        If the number of types is neither one nor the number of columns
    """
    # Always work on a copy: broadcasting must not change the list of the caller
    types = [value] if isinstance(value, str) else list(value)
    if len(types) == 1:
        types = types * len(self)
    if len(types) != len(self):
        raise ValueError(
            "Length mismatch. "
            "Can only set the same number of names as columns or 1 for broadcasting"
        )

    newcols = []
    for new_type, col in zip(types, self._columns):
        mapping = col._variable_mapping
        if isinstance(mapping, DataVariable):
            new_mapping = mapping.replace(type=new_type)
        else:
            new_mapping = {key: var.replace(type=new_type) for key, var in mapping.items()}
        newcols.append(col.replace(variable_mapping=new_mapping))

    # The token must be passed explicitly, else create resets it to the
    # token of the configuration
    return DataInfo.create(
        columns=newcols,
        path=self._path,
        separator=self._separator,
        missing_data_token=self._missing_data_token,
    )
[docs]
def find_single_column_name(self, type: str, default: Optional[str] = None) -> str:
    """Find name of single column given type

    Gives the name of the only column of the given type or the default if
    there is no such column.

    Parameters
    ----------
    type : str
        Column type
    default : Optional[str]
        Default if column type is not found

    Return
    ------
    str
        Name of column

    Raises
    ------
    ValueError
        If more than one column is found or if no column is found and no
        default is given
    """
    try:
        matches = self.typeix[type]
    except IndexError:
        if not default:
            raise ValueError(f'Colum of type {type} not found and no default given')
        return default

    if len(matches) > 1:
        raise ValueError(f'More than one column found: {matches.names}')
    return matches[0].name
[docs]
def get_dtype_dict(self) -> dict[str, str]:
    """Create a dictionary from column names to pandas dtypes

    This can be used as input to some pandas functions to convert
    column to the correct pandas dtype. Dropped columns and columns with
    a datatype starting with "nmtran" get the dtype 'str'.

    Returns
    -------
    dict
        Column name to pandas dtype

    Examples
    --------
    >>> from pharmpy.modeling import *
    >>> model = load_example_model("pheno")
    >>> model.datainfo.get_dtype_dict()
    {'ID': 'int32',
     'TIME': 'float64',
     'AMT': 'float64',
     'WGT': 'float64',
     'APGR': 'float64',
     'DV': 'float64',
     'FA1': 'float64',
     'FA2': 'float64'}
    """
    dtypes = {}
    for col in self:
        as_string = col.drop or col.datatype.startswith('nmtran')
        dtypes[col.name] = 'str' if as_string else col.datatype
    return dtypes
[docs]
def to_dict(self) -> dict[str, Any]:
    """Convert to a dictionary in which the path is always None"""
    as_dict = self._to_dict(path=None)
    return as_dict
[docs]
@classmethod
def from_dict(cls, d: dict[str, Any]) -> DataInfo:
    """Create a DataInfo from a dictionary

    Parameters
    ----------
    d : dict
        Dictionary with the mandatory key 'columns' and the optional keys
        'path' (default None), 'separator' (default ',') and
        'missing_data_token' (default None, handled by ``create``)

    Returns
    -------
    DataInfo
        Created through ``create``, i.e. all values are validated
    """
    columns = tuple(ColumnInfo.from_dict(col) for col in d['columns'])
    # Optional keys get the same defaults as the corresponding arguments of create
    path = d.get('path')
    return cls.create(
        columns=columns,
        path=None if path is None else Path(path),
        separator=d.get('separator', ','),
        missing_data_token=d.get('missing_data_token'),
    )
def _to_dict(self, path: Optional[str]) -> dict[str, Any]:
    """Convert to a dictionary using the given path instead of the path of the DataInfo"""
    as_dict: dict[str, Any] = {"columns": [col.to_dict() for col in self._columns]}
    as_dict["path"] = str(path) if path is not None else None
    as_dict["separator"] = self._separator
    as_dict["missing_data_token"] = self._missing_data_token
    return as_dict
[docs]
def to_json(self, path: Optional[Union[Path, str]] = None):
    """Convert to JSON

    Parameters
    ----------
    path : Path or str
        File to write the JSON to. The path of the dataset is then stored
        relative to the directory of this file. If None the JSON is
        returned as a string with the dataset path stored as it is.

    Returns
    -------
    str or None
        The JSON string if no path was given, otherwise None
    """
    if self.path is None:
        dataset_path = None
    elif path is None:
        dataset_path = str(self.path)
    else:
        dataset_path = str(path_relative_to(Path(path).parent, self.path))

    d = self._to_dict(dataset_path)
    d['__version__'] = 1

    if path is None:
        return json.dumps(d)
    with open(path, 'w') as fp:
        json.dump(d, fp)
@staticmethod
def _populate_dict_with_defaults(d: dict[str, Any]):
    """Add default values, in place, for all optional keys missing in a datainfo dictionary

    Parameters
    ----------
    d : dict
        Datainfo dictionary with a 'columns' list where each column has a
        non-empty 'variable_mapping'
    """

    def _fill_variable(variable):
        variable.setdefault('type', 'unknown')
        variable.setdefault('count', False)
        variable.setdefault('properties', {})
        variable.setdefault('scale', 'ratio')

    d.setdefault('path', None)
    d.setdefault('missing_data_token', None)

    for col in d['columns']:
        col.setdefault('variable_id', None)
        col.setdefault('drop', False)
        col.setdefault('datatype', 'float64')

        variable_mapping = col['variable_mapping']
        # A mapping to multiple variables has keys that can be converted to
        # int. Otherwise the dictionary is a single variable in itself.
        first_key = next(iter(variable_mapping))
        try:
            int(first_key)
        except ValueError:
            _fill_variable(variable_mapping)
        else:
            for variable in variable_mapping.values():
                _fill_variable(variable)
[docs]
@staticmethod
def from_json(s: str) -> DataInfo:
    """Create DataInfo from JSON string

    Optional keys missing in the JSON get default values.

    Parameters
    ----------
    s : str
        JSON string

    Return
    ------
    DataInfo
        Created DataInfo object
    """
    d = json.loads(s)
    # The version is not part of the DataInfo. Do not fail if it is absent.
    d.pop('__version__', None)
    DataInfo._populate_dict_with_defaults(d)
    return DataInfo.from_dict(d)
[docs]
@staticmethod
def read_json(path: Union[Path, str]) -> DataInfo:
    """Read DataInfo from JSON file

    A relative dataset path in the file is taken to be relative to the
    directory of the file and is made absolute.

    Parameters
    ----------
    path : Path or str
        Path to JSON datainfo file

    Return
    ------
    DataInfo
        Created DataInfo object
    """
    # JSON is UTF-8 encoded; do not depend on the default encoding of the locale
    with open(path, 'r', encoding='utf-8') as fp:
        s = fp.read()
    di = DataInfo.from_json(s)
    if di.path is None or di.path.is_absolute():
        return di
    return di.replace(path=path_absolute(Path(path).parent / di.path))
@property
def variables(self) -> list[DataVariable]:
    """List of the data variables of all columns, in column order"""
    collected = []
    for col in self._columns:
        if isinstance(col._variable_mapping, DataVariable):
            collected.append(col.variable)
        else:
            collected.extend(col._variable_mapping.values())
    return collected
def __repr__(self):
    """Representation as a table with one row per data variable"""
    # Column level attributes are repeated once for each variable of the column
    colnames, drop, datatype = [], [], []
    for col in self._columns:
        n_variables = len(col)
        colnames.extend([col.name] * n_variables)
        drop.extend([col.drop] * n_variables)
        datatype.extend([col.datatype] * n_variables)

    variables = self.variables
    table = {
        'name': colnames,
        'variable': [var.name for var in variables],
        'type': [var.type for var in variables],
        'scale': [var.scale for var in variables],
        'count': [var.count for var in variables],
        'drop': drop,
        'datatype': datatype,
        'properties': [var.properties for var in variables],
    }
    return pd.DataFrame(table).to_string(index=False)
[docs]
def find_column_by_property(self, property: str, value: Any) -> Optional[ColumnInfo]:
    """Find a single column having a property/value pair

    Parameters
    ----------
    property : str
        Name of the property
    value : Any
        Value of the property

    Returns
    -------
    ColumnInfo or None
        The only column for which all variables have the pair. None if
        there is no such column or more than one.
    """
    found = None
    for col in self:
        if any(var.properties.get(property, None) != value for var in col.variables):
            continue
        if found is not None:
            # A second match means that the column is not unique
            return None
        found = col
    return found
class TypeIndexer:
    """Indexer for selecting the columns of a given type that are not dropped"""

    def __init__(self, obj):
        # The object to select columns from; iterated in __getitem__
        self._obj = obj

    def __getitem__(self, i) -> DataInfo:
        """Create a DataInfo from the non-dropped columns of type i

        Raises
        ------
        IndexError
            If there are no such columns
        """
        selected = []
        for col in self._obj:
            if col.type == i and not col.drop:
                selected.append(col)
        if not selected:
            raise IndexError(f"No columns of type {i} available")
        return DataInfo.create(selected)