thermo #

Train against thermodynamic properties.

Classes:

DataEntry –

Represents a single experimental data point.
SimulationKey –

A key used to identify a simulation.
SimulationConfig –

Configuration for a simulation to run.

Functions:

create_dataset –

Create a dataset from a list of existing data points.
extract_smiles –

Return a list of unique SMILES strings in the dataset.
default_config –

Return a default simulation configuration for the specified phase.
predict –

Predict the properties in a dataset using molecular simulation, or by reweighting previous simulation data.

DataEntry #

Bases: TypedDict

Represents a single experimental data point.

Attributes:

type (DataType) –

The type of data point.
smiles_a (str) –

The SMILES definition of the first component.
x_a (float | None) –

The mole fraction of the first component. This must be set to 1.0 if the data point is for a pure (single-component) system.
smiles_b (str | None) –

The SMILES definition of the second component if present.
x_b (float | None) –

The mole fraction of the second component if present.
temperature (float) –

The temperature at which the data point was measured.
pressure (float) –

The pressure at which the data point was measured.
value (float) –

The value of the data point.
std (float | None) –

The standard deviation of the data point if available.
units (str) –

The units of the data point.
source (str) –

The source of the data point.

type `instance-attribute` #

type: DataType

The type of data point.

smiles_a `instance-attribute` #

smiles_a: str

The SMILES definition of the first component.

x_a `instance-attribute` #

x_a: float | None

The mole fraction of the first component. This must be set to 1.0 if the data point is for a pure (single-component) system.

smiles_b `instance-attribute` #

smiles_b: str | None

The SMILES definition of the second component if present.

x_b `instance-attribute` #

x_b: float | None

The mole fraction of the second component if present.

temperature `instance-attribute` #

temperature: float

The temperature at which the data point was measured.

pressure `instance-attribute` #

pressure: float

The pressure at which the data point was measured.

value `instance-attribute` #

value: float

The value of the data point.

std `instance-attribute` #

std: float | None

The standard deviation of the data point if available.

units `instance-attribute` #

units: str

The units of the data point.

source `instance-attribute` #

source: str

The source of the data point.

SimulationKey #

Bases: NamedTuple

A key used to identify a simulation.

Attributes:

smiles (tuple[str, ...]) –

The SMILES definitions of the components present in the system.
counts (tuple[int, ...]) –

The number of copies of each component present in the system.
temperature (float) –

The temperature [K] at which the simulation was run.
pressure (float | None) –

The pressure [atm] at which the simulation was run.

smiles `instance-attribute` #

smiles: tuple[str, ...]

The SMILES definitions of the components present in the system.

counts `instance-attribute` #

counts: tuple[int, ...]

The number of copies of each component present in the system.

temperature `instance-attribute` #

temperature: float

The temperature [K] at which the simulation was run.

pressure `instance-attribute` #

pressure: float | None

The pressure [atm] at which the simulation was run.

SimulationConfig `pydantic-model` #

Bases: BaseModel

Configuration for a simulation to run.

Fields:

max_mols (int)
gen_coords (GenerateCoordsConfig)
apply_hmr (bool)
equilibrate (list[MinimizationConfig | SimulationConfig])
production (SimulationConfig)
production_frequency (int)

max_mols `pydantic-field` #

max_mols: int

The maximum number of molecules to simulate.

gen_coords `pydantic-field` #

gen_coords: GenerateCoordsConfig

Configuration for generating initial coordinates.

apply_hmr `pydantic-field` #

apply_hmr: bool = False

Whether to apply hydrogen mass repartitioning.

equilibrate `pydantic-field` #

equilibrate: list[MinimizationConfig | SimulationConfig]

Configuration for equilibration simulations.

production `pydantic-field` #

production: SimulationConfig

Configuration for the production simulation.

production_frequency `pydantic-field` #

production_frequency: int

The frequency at which to write frames during production.

create_dataset #

create_dataset(*rows: DataEntry) -> Dataset

Create a dataset from a list of existing data points.

Parameters:

rows (DataEntry, default: () ) –

The data points to create the dataset from.

Returns:

Dataset –

The created dataset.

Source code in descent/targets/thermo.py

def create_dataset(*rows: DataEntry) -> datasets.Dataset:
    """Create a dataset from a list of existing data points.

    The ``smiles_a`` value of every row, and the ``smiles_b`` value when it is
    not ``None``, is passed through ``_map_smiles`` before the table is built.
    The rows supplied by the caller are left unmodified.

    Args:
        rows: The data points to create the dataset from.

    Returns:
        The created dataset.
    """

    mapped_rows = []

    for row in rows:
        # Work on a shallow copy so the caller's dictionaries are not silently
        # rewritten with the mapped SMILES.
        mapped_row = {**row, "smiles_a": _map_smiles(row["smiles_a"])}

        # Rows without a second component keep ``smiles_b`` as ``None``.
        if row["smiles_b"] is not None:
            mapped_row["smiles_b"] = _map_smiles(row["smiles_b"])

        mapped_rows.append(mapped_row)

    # TODO: validate rows
    table = pyarrow.Table.from_pylist(mapped_rows, schema=DATA_SCHEMA)

    dataset = datasets.Dataset(datasets.table.InMemoryTable(table))
    return dataset

extract_smiles #

extract_smiles(dataset: Dataset) -> list[str]

Return a list of unique SMILES strings in the dataset.

Parameters:

dataset (Dataset) –

The dataset to extract the SMILES strings from.

Returns:

list[str] –

The unique SMILES strings with full atom mapping.

Source code in descent/targets/thermo.py

def extract_smiles(dataset: datasets.Dataset) -> list[str]:
    """Collect every distinct SMILES string stored in the dataset.

    Both the ``smiles_a`` and ``smiles_b`` columns are inspected, and ``None``
    entries (e.g. a missing second component) are ignored.

    Args:
        dataset: The dataset to extract the SMILES strings from.

    Returns:
        The unique SMILES strings found across both columns, in sorted order.
    """
    found = set()

    for column in ("smiles_a", "smiles_b"):
        # ``unique`` may still report a null entry, so filter it out here.
        found.update(value for value in dataset.unique(column) if value is not None)

    return sorted(found)

default_config #

default_config(
    phase: Phase, temperature: float, pressure: float | None
) -> SimulationConfig

Return a default simulation configuration for the specified phase.

Parameters:

phase (Phase) –

The phase to return the default configuration for.
temperature (float) –

The temperature [K] at which to run the simulation.
pressure (float | None) –

The pressure [atm] at which to run the simulation.

Returns:

SimulationConfig –

The default simulation configuration.

Source code in descent/targets/thermo.py

def default_config(
    phase: Phase, temperature: float, pressure: float | None
) -> SimulationConfig:
    """Build the default simulation configuration for a given phase.

    The phase name is matched case-insensitively.

    Args:
        phase: The phase to return the default configuration for.
        temperature: The temperature [K] at which to run the simulation.
        pressure: The pressure [atm] at which to run the simulation.

    Returns:
        The default simulation configuration.

    Raises:
        NotImplementedError: If the phase is neither ``"bulk"`` nor ``"vacuum"``.
    """

    normalized_phase = phase.lower()

    if normalized_phase == "bulk":
        return _bulk_config(temperature, pressure)

    if normalized_phase == "vacuum":
        return _vacuum_config(temperature, pressure)

    # Report the phase exactly as the caller supplied it.
    raise NotImplementedError(phase)

predict #

predict(
    dataset: Dataset,
    force_field: TensorForceField,
    topologies: dict[str, TensorTopology],
    output_dir: Path,
    cached_dir: Path | None = None,
    per_type_scales: dict[DataType, float] | None = None,
) -> tuple[Tensor, Tensor, Tensor, Tensor]

Predict the properties in a dataset using molecular simulation, or by reweighting previous simulation data.

Parameters:

dataset (Dataset) –

The dataset to predict the properties of.
force_field (TensorForceField) –

The force field to use.
topologies (dict[str, TensorTopology]) –

The topologies of the molecules present in the dataset, with keys of mapped SMILES patterns.
output_dir (Path) –

The directory to write the simulation trajectories to.
cached_dir (Path | None, default: None ) –

The (optional) directory to read cached simulation trajectories from.
per_type_scales (dict[DataType, float] | None, default: None ) –

The scale factor to apply to each data type. A default of 1.0 will be used for any data type not specified.

Source code in descent/targets/thermo.py

def predict(
    dataset: datasets.Dataset,
    force_field: smee.TensorForceField,
    topologies: dict[str, smee.TensorTopology],
    output_dir: pathlib.Path,
    cached_dir: pathlib.Path | None = None,
    per_type_scales: dict[DataType, float] | None = None,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Predict the properties in a dataset using molecular simulation, or by reweighting
    previous simulation data.

    Args:
        dataset: The dataset to predict the properties of.
        force_field: The force field to use.
        topologies: The topologies of the molecules present in the dataset, with keys
            of mapped SMILES patterns.
        output_dir: The directory to write the simulation trajectories to.
        cached_dir: The (optional) directory to read cached simulation trajectories
            from.
        per_type_scales: The scale factor to apply to each data type. A default of 1.0
            will be used for any data type not specified.

    Returns:
        The reference values, the reference standard deviations, the predicted
        values and the predicted standard deviations, in that order. Each holds
        one element per dataset entry. Values are multiplied by the scale of the
        entry's data type and standard deviations by its absolute value. Missing
        standard deviations are reported as NaN.
    """

    entries: list[DataEntry] = [*descent.utils.dataset.iter_dataset(dataset)]

    required_simulations, entry_to_simulation = _plan_simulations(entries, topologies)
    # Observables are computed once per required simulation, grouped by phase, so
    # entries that share a simulation key reuse the same result.
    observables = {
        phase: {
            key: _compute_observables(
                phase, key, system, force_field, output_dir, cached_dir
            )
            for key, system in systems.items()
        }
        for phase, systems in required_simulations.items()
    }

    predicted = []
    predicted_std = []
    reference = []
    reference_std = []

    per_type_scales = per_type_scales if per_type_scales is not None else {}

    for entry, keys in zip(entries, entry_to_simulation):
        value, std = _predict(entry, keys, observables, required_simulations)

        type_scale = per_type_scales.get(entry["type"], 1.0)

        predicted.append(value * type_scale)
        # ``torch.stack`` only accepts tensors, so a missing uncertainty must be a
        # NaN tensor shaped like the prediction rather than the float ``torch.nan``.
        predicted_std.append(
            torch.full_like(value, torch.nan)
            if std is None
            else std * abs(type_scale)
        )

        reference.append(entry["value"] * type_scale)
        reference_std.append(
            torch.nan if entry["std"] is None else entry["std"] * abs(type_scale)
        )

    predicted = torch.stack(predicted)
    predicted_std = torch.stack(predicted_std)

    # NOTE(review): presumably converts the plain Python lists into tensors that
    # match the predicted tensors (dtype / device) - confirm against smee.
    reference = smee.utils.tensor_like(reference, predicted)
    reference_std = smee.utils.tensor_like(reference_std, predicted_std)

    return reference, reference_std, predicted, predicted_std

thermo #

DataEntry #

type instance-attribute #

smiles_a instance-attribute #

x_a instance-attribute #

smiles_b instance-attribute #

x_b instance-attribute #

temperature instance-attribute #

pressure instance-attribute #

value instance-attribute #

std instance-attribute #

units instance-attribute #

source instance-attribute #

SimulationKey #

smiles instance-attribute #

counts instance-attribute #

temperature instance-attribute #

pressure instance-attribute #

SimulationConfig pydantic-model #

max_mols pydantic-field #

gen_coords pydantic-field #

apply_hmr pydantic-field #

equilibrate pydantic-field #

production pydantic-field #

production_frequency pydantic-field #

create_dataset #

extract_smiles #

default_config #

predict #

type `instance-attribute` #

smiles_a `instance-attribute` #

x_a `instance-attribute` #

smiles_b `instance-attribute` #

x_b `instance-attribute` #

temperature `instance-attribute` #

pressure `instance-attribute` #

value `instance-attribute` #

std `instance-attribute` #

units `instance-attribute` #

source `instance-attribute` #

smiles `instance-attribute` #

counts `instance-attribute` #

temperature `instance-attribute` #

pressure `instance-attribute` #

SimulationConfig `pydantic-model` #

max_mols `pydantic-field` #

gen_coords `pydantic-field` #

apply_hmr `pydantic-field` #

equilibrate `pydantic-field` #

production `pydantic-field` #

production_frequency `pydantic-field` #