Source code for mchammer.data_containers.wang_landau_data_container

"""Definition of the Wang-Landau data container class."""

from warnings import warn
from collections import Counter, OrderedDict
from typing import Any, BinaryIO, Dict, List, Optional, TextIO, Tuple, Union

import numpy as np
import pandas as pd

from ase.units import kB
from ase import Atoms
from pandas import DataFrame, concat as pd_concat

from icet import ClusterSpace
from .base_data_container import BaseDataContainer


class WangLandauDataContainer(BaseDataContainer):
    """
    Data container for storing information concerned with
    :ref:`Wang-Landau simulation <wang_landau_ensemble>`
    performed with :program:`mchammer`.

    Parameters
    ----------
    structure
        Reference atomic structure associated with the data container.
    ensemble_parameters
        Parameters associated with the underlying ensemble.
    metadata
        Metadata associated with the data container.
    """

    def _update_last_state(self,
                           last_step: int,
                           occupations: List[int],
                           accepted_trials: int,
                           random_state: tuple,
                           fill_factor: float,
                           fill_factor_history: Dict[int, float],
                           entropy_history: Dict[int, Dict[int, float]],
                           histogram: Dict[int, int],
                           entropy: Dict[int, float]):
        """Updates the last state of the Wang-Landau simulation.

        Parameters
        ----------
        last_step
            Last trial step.
        occupations
            Occupation vector observed during the last trial step.
        accepted_trials
            Number of accepted trial steps so far.
        random_state
            Tuple representing the last state of the random number generator.
        fill_factor
            Fill factor of the Wang-Landau algorithm.
        fill_factor_history
            Evolution of the fill factor of the Wang-Landau algorithm
            (key=MC trial step, value=fill factor).
        entropy_history
            Evolution of the (relative) entropy accumulated during the
            Wang-Landau simulation (key=MC trial step,
            value=(key=bin, value=entropy)).
        histogram
            Histogram of states visited during the Wang-Landau simulation.
        entropy
            (Relative) entropy accumulated during the Wang-Landau simulation.
        """
        super()._update_last_state(
            last_step=last_step,
            occupations=occupations,
            accepted_trials=accepted_trials,
            random_state=random_state)
        self._last_state['fill_factor'] = fill_factor
        self._last_state['fill_factor_history'] = fill_factor_history
        self._last_state['entropy_history'] = entropy_history
        self._last_state['histogram'] = histogram
        self._last_state['entropy'] = entropy

    @property
    def fill_factor(self) -> float:
        """ Final value of the fill factor in the Wang-Landau algorithm. """
        return float(self._last_state['fill_factor'])

    @property
    def fill_factor_history(self) -> DataFrame:
        """ Evolution of the fill factor in the Wang-Landau algorithm. """
        return DataFrame({'mctrial': list(self._last_state['fill_factor_history'].keys()),
                          'fill_factor': list(self._last_state['fill_factor_history'].values())})
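
    # A minimal usage sketch for the two properties above (the file name
    # 'wl_run.dc' is hypothetical; it assumes a Wang-Landau run was saved
    # there, e.g., via the dc_filename argument of WangLandauEnsemble).
    # The fill factor history is a convenient convergence monitor:
    #
    # >>> dc = WangLandauDataContainer.read('wl_run.dc')
    # >>> print(dc.fill_factor)
    # >>> dc.fill_factor_history.plot(x='mctrial', y='fill_factor', logy=True)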
    def get(self,
            *tags: str,
            fill_factor_limit: float = None) \
            -> Union[np.ndarray, List[Atoms], Tuple[np.ndarray, List[Atoms]]]:
        """Returns the accumulated data for the requested observables,
        including configurations stored in the data container. The latter
        can be achieved by including ``'trajectory'`` as one of the tags.

        Parameters
        ----------
        tags
            Names of the requested properties.
        fill_factor_limit
            If specified, return only data recorded up to the point when the
            fill factor dropped below the given limit; otherwise return all
            data.

        Raises
        ------
        TypeError
            If :attr:`tags` is empty.
        ValueError
            If observables are requested that are not in the data container.

        Examples
        --------
        Below the :func:`get` method is illustrated but first we require a
        data container.

        >>> from ase import Atoms
        >>> from icet import ClusterExpansion, ClusterSpace
        >>> from mchammer.calculators import ClusterExpansionCalculator
        >>> from mchammer.ensembles import WangLandauEnsemble

        >>> # prepare cluster expansion
        >>> prim = Atoms('Au', positions=[[0, 0, 0]], cell=[1, 1, 10], pbc=True)
        >>> cs = ClusterSpace(prim, cutoffs=[1.1], chemical_symbols=['Ag', 'Au'])
        >>> ce = ClusterExpansion(cs, [0, 0, 2])

        >>> # prepare initial configuration
        >>> structure = prim.repeat((4, 4, 1))
        >>> for k in range(8):
        ...     structure[k].symbol = 'Ag'

        >>> # set up and run Wang-Landau simulation
        >>> calculator = ClusterExpansionCalculator(structure, ce)
        >>> mc = WangLandauEnsemble(structure=structure,
        ...                         calculator=calculator,
        ...                         energy_spacing=1,
        ...                         dc_filename='ising_2d_run.dc',
        ...                         fill_factor_limit=0.3)
        >>> mc.run(number_of_trial_steps=len(structure)*3000)
        ... # in practice one requires more steps

        We can now access the data container by reading it from file using
        the :func:`read` method. For the purpose of this example, however,
        we access the data container associated with the ensemble directly.

        >>> dc = mc.data_container

        The following lines illustrate how to use the :func:`get` method
        for extracting data from the data container.

        >>> # obtain all values of the potential represented by
        >>> # the cluster expansion and the MC trial step along the
        >>> # trajectory
        >>> import matplotlib.pyplot as plt
        >>> s, p = dc.get('mctrial', 'potential')
        >>> _ = plt.plot(s, p)

        >>> # as above but this time only include data recorded up to
        >>> # the point when the fill factor dropped below 0.6
        >>> s, p = dc.get('mctrial', 'potential', fill_factor_limit=0.6)
        >>> _ = plt.plot(s, p)
        >>> plt.show(block=False)

        >>> # obtain configurations along the trajectory along with
        >>> # their potential
        >>> p, confs = dc.get('potential', 'trajectory')
        """

        if len(tags) == 0:
            raise TypeError('Missing tags argument')

        local_tags = ['occupations' if tag == 'trajectory' else tag
                      for tag in tags]

        for tag in local_tags:
            if tag == 'mctrial':
                continue
            if tag not in self.observables:
                raise ValueError('No observable named {} in data container'.format(tag))

        # collect data
        mctrials = [row_dict['mctrial'] for row_dict in self._data_list]
        data = pd.DataFrame.from_records(self._data_list, index=mctrials,
                                         columns=local_tags)
        if fill_factor_limit is not None:
            # only include data for fill factors up to the limit
            df_ffh = self.fill_factor_history.astype(
                {'mctrial': np.int64, 'fill_factor': np.float64})
            mctrial_last = df_ffh.loc[
                df_ffh.fill_factor <= fill_factor_limit].mctrial.min()
            data = data.loc[data.index <= mctrial_last]
        data.dropna(inplace=True)

        # handling of trajectory
        def occupation_to_atoms(occupation):
            structure = self.structure.copy()
            structure.numbers = occupation
            return structure

        data_list = []
        for tag in local_tags:
            if tag == 'occupations':
                traj = [occupation_to_atoms(o) for o in data['occupations']]
                data_list.append(traj)
            else:
                data_list.append(data[tag].values)
        if len(data_list) > 1:
            return tuple(data_list)
        else:
            return data_list[0]
    def get_entropy(self, fill_factor_limit: float = None) -> DataFrame:
        """Returns the (relative) entropy from this data container
        accumulated during a :ref:`Wang-Landau simulation
        <wang_landau_ensemble>`. Returns ``None`` if the data container
        does not contain the required information.

        Parameters
        ----------
        fill_factor_limit
            Return the entropy recorded up to the point when the specified
            fill factor limit was reached, or ``None`` if the entropy
            history is empty or the last fill factor is above the limit.
            Otherwise return the entropy for the last state.
        """

        if 'entropy' not in self._last_state:
            warn('There is no entropy information in the data container.')
            return None
        entropy = self._last_state['entropy']
        if fill_factor_limit is not None:
            if 'entropy_history' not in self._last_state or \
                    len(self._last_state['entropy_history']) == 0:
                warn('The entropy history is empty.')
                return None
            if self._last_state['fill_factor'] > fill_factor_limit:
                warn('The last fill factor {} is higher than the limit'
                     ' {}.'.format(self.fill_factor, fill_factor_limit))
                return None
            for step, fill_factor in self._last_state['fill_factor_history'].items():
                if fill_factor <= fill_factor_limit:
                    entropy = self._last_state['entropy_history'][step]
                    break

        # compile entropy into DataFrame
        energy_spacing = self.ensemble_parameters['energy_spacing']
        df = DataFrame(data={'energy': energy_spacing * np.array(list(entropy.keys())),
                             'entropy': np.array(list(entropy.values()))},
                       index=list(entropy.keys()))
        # shift entropy for numerical stability
        df['entropy'] -= np.min(df['entropy'])

        return df
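
    # Usage sketch (hypothetical; 'dc' is a WangLandauDataContainer from a
    # finished run). The returned DataFrame can be plotted directly to
    # inspect the microcanonical entropy as a function of energy:
    #
    # >>> df = dc.get_entropy(fill_factor_limit=1e-4)
    # >>> df.plot(x='energy', y='entropy')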
    def get_histogram(self) -> DataFrame:
        """Returns the histogram from this data container accumulated since
        the last update of the fill factor. Returns ``None`` if the data
        container does not contain the required information.
        """

        if 'histogram' not in self._last_state:
            return None

        # compile histogram into DataFrame
        histogram = self._last_state['histogram']
        energy_spacing = self.ensemble_parameters['energy_spacing']
        df = DataFrame(data={'energy': energy_spacing * np.array(list(histogram.keys())),
                             'histogram': np.array(list(histogram.values()))},
                       index=list(histogram.keys()))

        return df
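
    # Usage sketch (hypothetical 'dc' as above). The histogram accumulated
    # since the last fill factor update indicates how evenly the energy bins
    # are being sampled, which underlies the Wang-Landau flatness criterion:
    #
    # >>> df = dc.get_histogram()
    # >>> if df is not None:
    # ...     print(df.histogram.min() / df.histogram.mean())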
    @classmethod
    # todo: cls and the return should be type hinted as BaseDataContainer.
    # Unfortunately, this requires from __future__ import annotations, which
    # in turn requires Python 3.8.
    def read(cls,
             infile: Union[str, BinaryIO, TextIO],
             old_format: bool = False):
        """Reads data container from file.

        Parameters
        ----------
        infile
            File from which to read.
        old_format
            If ``True``, use the old json format when reading runtime data
            [default: ``False``].

        Raises
        ------
        FileNotFoundError
            If the file is not found (when :attr:`infile` is a str).
        ValueError
            If the file is of incorrect type (not a tarball).
        """
        dc = super(WangLandauDataContainer, cls).read(infile=infile, old_format=old_format)

        for tag, value in dc._last_state.items():
            if tag in ['histogram', 'entropy', 'fill_factor_history', 'entropy_history']:
                # the following accounts for the fact that the keys of dicts
                # are converted to str when writing to json and have to be
                # converted back into numerical values
                dc._last_state[tag] = {}
                for key, val in value.items():
                    if isinstance(val, dict):
                        val = {int(k): v for k, v in val.items()}
                    dc._last_state[tag][int(key)] = val

        return dc
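
# Round-trip sketch for read (hypothetical file name; writing is assumed to
# be handled by the write method inherited from BaseDataContainer). Note that
# read() restores the integer bin keys of the histogram and entropy dicts,
# which json stores as strings:
#
# >>> dc.write('wl_run.dc')
# >>> dc2 = WangLandauDataContainer.read('wl_run.dc')
# >>> assert all(isinstance(k, int) for k in dc2._last_state['histogram'])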
def get_density_of_states_wl(dcs: Union[WangLandauDataContainer,
                                        Dict[Any, WangLandauDataContainer]],
                             fill_factor_limit: float = None) \
        -> Tuple[DataFrame, dict]:
    """Returns a pandas DataFrame with the total density of states from a
    :ref:`Wang-Landau simulation <wang_landau_ensemble>`. If a dict of data
    containers is provided the function also returns a dictionary that
    contains the standard deviation between the entropy of neighboring data
    containers in the overlap region. These errors should be small compared
    to the variation of the entropy across each bin.

    The function can handle both a single data container and a dict thereof.
    In the latter case the data containers must cover a contiguous energy
    range and must at least partially overlap.

    Parameters
    ----------
    dcs
        Data container(s) from which to extract the density of states.
    fill_factor_limit
        Calculate the density of states using the entropy recorded up to the
        point when the specified fill factor limit was reached. Otherwise
        return the density of states for the last state.

    Raises
    ------
    TypeError
        If :attr:`dcs` is neither a single data container nor a dict of
        data containers from which the entropy can be retrieved.
    ValueError
        If the data container does not contain entropy information.
    ValueError
        If a fill factor limit has been provided and the data container
        either does not contain information about the entropy history or
        the last fill factor is higher than the specified limit.
    ValueError
        If multiple data containers are provided and there are
        inconsistencies with regard to basic simulation parameters such
        as system size or energy spacing.
    ValueError
        If multiple data containers are provided and there is at least
        one energy region without overlap.
    """

    # preparations
    if isinstance(dcs, WangLandauDataContainer):
        # fetch raw entropy data from data container
        df = dcs.get_entropy(fill_factor_limit)
        if df is None:
            raise ValueError('Entropy information could not be retrieved from'
                             ' the data container {}.'.format(dcs))
        errors = None
        if len(dcs.fill_factor_history) == 0 or dcs.fill_factor > 1e-4:
            warn('The data container appears to contain data from an'
                 ' underconverged Wang-Landau simulation.')

    elif isinstance(dcs, dict) and isinstance(dcs[next(iter(dcs))], WangLandauDataContainer):
        # minimal consistency checks
        tags = list(dcs.keys())
        tagref = tags[0]
        dcref = dcs[tagref]
        for tag in tags:
            dc = dcs[tag]
            if len(dc.structure) != len(dcref.structure):
                raise ValueError('Number of atoms differs between data containers ({}: {}, {}: {})'
                                 .format(tagref, dcref.ensemble_parameters['n_atoms'],
                                         tag, dc.ensemble_parameters['n_atoms']))
            for param in ['energy_spacing', 'trial_move']:
                if dc.ensemble_parameters[param] != dcref.ensemble_parameters[param]:
                    raise ValueError('{} differs between data containers ({}: {}, {}: {})'
                                     .format(param,
                                             tagref, dcref.ensemble_parameters[param],
                                             tag, dc.ensemble_parameters[param]))
            if len(dc.fill_factor_history) == 0 or dc.fill_factor > 1e-4:
                warn('Data container {} appears to contain data from an'
                     ' underconverged Wang-Landau simulation.'.format(tag))

        # fetch raw entropy data from data containers
        entropies = {}
        for tag, dc in dcs.items():
            entropies[tag] = dc.get_entropy(fill_factor_limit)
            if entropies[tag] is None:
                raise ValueError('Entropy information could not be retrieved'
                                 ' from the data container {}.'.format(dc))

        # sort entropies by energy
        entropies = OrderedDict(sorted(entropies.items(),
                                       key=lambda row: row[1].energy.iloc[0]))

        # line up entropy data
        errors = {}
        tags = list(entropies.keys())
        for tag1, tag2 in zip(tags[:-1], tags[1:]):
            df1 = entropies[tag1]
            df2 = entropies[tag2]
            if all(df2.energy.isin(df1.energy)):
                warn('Window {} is a subset of {}'.format(tag2, tag1))
            left_lim = np.min(df2.energy)
            right_lim = np.max(df1.energy)
            if left_lim >= right_lim:
                raise ValueError('No overlap in the energy range {}...{}.\n'
                                 .format(right_lim, left_lim) +
                                 ' The closest data containers have tags "{}" and "{}".'
                                 .format(tag1, tag2))
            df1_ = df1[(df1.energy >= left_lim) & (df1.energy <= right_lim)]
            df2_ = df2[(df2.energy >= left_lim) & (df2.energy <= right_lim)]
            offset = (df2_.entropy - df1_.entropy).mean()
            errors['{}-{}'.format(tag1, tag2)] = (df2_.entropy - df1_.entropy).std()
            entropies[tag2].entropy = entropies[tag2].entropy - offset

        # compile entropy over the entire energy range
        data: Dict[float, float] = {}
        indices = {}
        counts = Counter()
        for df in entropies.values():
            for index, en, ent in zip(df.index, df.energy, df.entropy):
                data[en] = data.get(en, 0) + ent
                counts[en] += 1
                indices[en] = index
        for en in data:
            data[en] = data[en] / counts[en]

        # center entropy to prevent possible numerical issues
        entmin = np.min(list(data.values()))
        df = DataFrame(data={'energy': np.array(list(data.keys())),
                             'entropy': np.array(list(data.values())) - entmin},
                       index=list(indices.values()))
    else:
        raise TypeError('dcs ({}) must be a data container with entropy data'
                        ' or be a dict of data containers'
                        .format(type(dcs)))

    # density of states
    S_max = df.entropy.max()
    df['density'] = np.exp(df.entropy - S_max) / np.sum(np.exp(df.entropy - S_max))

    return df, errors
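
# Sketch of typical usage (tags and file names are hypothetical; assumes
# several runs covering overlapping energy windows were saved as 'wl_k0.dc',
# 'wl_k1.dc', etc.):
#
# >>> dcs = {k: WangLandauDataContainer.read('wl_k{}.dc'.format(k))
# ...        for k in range(4)}
# >>> df, errors = get_density_of_states_wl(dcs)
# >>> df.plot(x='energy', y='density', logy=True)
# >>> print(errors)  # entropy std. dev. between neighboring windows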
def _extract_filter_data(dc: BaseDataContainer,
                         columns_to_keep: List[str],
                         fill_factor_limit: float = None) -> DataFrame:
    """ Extract data from a data container and filter the content.

    Parameters
    ----------
    dc
        Data container from which to extract the data.
    columns_to_keep
        List of requested properties.
    fill_factor_limit
        Only include data recorded up to the point when the specified fill
        factor limit was reached. Otherwise include all data.
    """

    df = dc.data
    if fill_factor_limit is not None:
        # only include data for fill factors up to the limit
        df_ffh = dc.fill_factor_history.astype(
            {'mctrial': np.int64, 'fill_factor': np.float64})
        mctrial_last = df_ffh.loc[
            df_ffh.fill_factor <= fill_factor_limit].mctrial.min()
        df = df.loc[df.mctrial <= mctrial_last]

    return df.filter(columns_to_keep)
def get_average_observables_wl(dcs: Union[WangLandauDataContainer,
                                          Dict[Any, WangLandauDataContainer]],
                               temperatures: List[float],
                               observables: List[str] = None,
                               boltzmann_constant: float = kB,
                               fill_factor_limit: float = None) -> DataFrame:
    """Returns the average and the standard deviation of the energy from a
    :ref:`Wang-Landau simulation <wang_landau_ensemble>` for the
    temperatures specified. If the :attr:`observables` keyword argument is
    specified the function will also return the mean and standard deviation
    of the specified observables.

    Parameters
    ----------
    dcs
        Data container(s) from which to extract the density of states
        as well as observables.
    temperatures
        Temperatures at which to compute the averages.
    observables
        Observables for which to compute averages; the observables
        must refer to fields in the data container.
    boltzmann_constant
        Boltzmann constant :math:`k_B` in appropriate units, i.e., units
        that are consistent with the underlying cluster expansion and the
        temperature units [default: eV/K].
    fill_factor_limit
        Use data recorded up to the point when the specified fill factor
        limit was reached when computing averages. Otherwise use data for
        the last state.

    Raises
    ------
    ValueError
        If the data container(s) do(es) not contain entropy data
        from a Wang-Landau simulation.
    ValueError
        If the data container(s) do(es) not contain the requested
        observable.
    """

    def check_observables(dc: WangLandauDataContainer,
                          observables: Optional[List[str]]) -> None:
        """ Helper function that checks that observables are available in the data frame. """
        if observables is None:
            return
        for obs in observables:
            if obs not in dc.data.columns:
                raise ValueError('Observable ({}) not in data container.\n'
                                 'Available observables: {}'.format(obs, dc.data.columns))

    # preparation of observables
    columns_to_keep = ['potential', 'density']
    if observables is not None:
        columns_to_keep.extend(observables)

    # check that observables are available in data container
    # and prepare comprehensive data frame with relevant information
    if isinstance(dcs, WangLandauDataContainer):
        check_observables(dcs, observables)
        df_combined = _extract_filter_data(dcs, columns_to_keep, fill_factor_limit)
        dcref = dcs
    elif isinstance(dcs, dict) and isinstance(dcs[next(iter(dcs))], WangLandauDataContainer):
        dfs = []
        for dc in dcs.values():
            check_observables(dc, observables)
            dfs.append(_extract_filter_data(dc, columns_to_keep, fill_factor_limit))
        df_combined = pd_concat([df for df in dfs], ignore_index=True)
        dcref = list(dcs.values())[0]
    else:
        raise TypeError('dcs ({}) must be a data container with entropy data'
                        ' or be a dict of data containers'
                        .format(type(dcs)))

    # fetch entropy and density of states from data container(s)
    df_density, _ = get_density_of_states_wl(dcs, fill_factor_limit)

    # compute density for each row in data container if observable averages
    # are to be computed
    if observables is not None:
        energy_spacing = dcref.ensemble_parameters['energy_spacing']
        # NOTE: we rely on the indices of the df_density DataFrame to
        # correspond to the energy scale. This is expected to be handled in
        # the get_density_of_states function.
        bins = list(np.array(np.round(df_combined.potential / energy_spacing), dtype=int))
        data_density = [dens / bins.count(k)
                        for k, dens in df_density.density[bins].items()]

    enref = np.min(df_density.energy)
    averages = []
    for temperature in temperatures:

        # mean and standard deviation of energy
        boltz = np.exp(- (df_density.energy - enref) / temperature / boltzmann_constant)
        sumint = np.sum(df_density.density * boltz)
        en_mean = np.sum(df_density.energy * df_density.density * boltz) / sumint
        en_std = np.sum(df_density.energy ** 2 * df_density.density * boltz) / sumint
        en_std = np.sqrt(en_std - en_mean ** 2)
        record = {'temperature': temperature,
                  'potential_mean': en_mean,
                  'potential_std': en_std}

        # mean and standard deviation of other observables
        if observables is not None:
            boltz = np.exp(- (df_combined.potential - enref) / temperature / boltzmann_constant)
            sumint = np.sum(data_density * boltz)
            for obs in observables:
                obs_mean = np.sum(data_density * boltz * df_combined[obs]) / sumint
                obs_std = np.sum(data_density * boltz * df_combined[obs] ** 2) / sumint
                obs_std = np.sqrt(obs_std - obs_mean ** 2)
                record['{}_mean'.format(obs)] = obs_mean
                record['{}_std'.format(obs)] = obs_std

        averages.append(record)

    return DataFrame.from_dict(averages)
def get_average_cluster_vectors_wl(dcs: Union[WangLandauDataContainer, dict],
                                   cluster_space: ClusterSpace,
                                   temperatures: List[float],
                                   boltzmann_constant: float = kB,
                                   fill_factor_limit: float = None) -> DataFrame:
    """Returns the average cluster vectors from a :ref:`Wang-Landau
    simulation <wang_landau_ensemble>` for the temperatures specified.

    Parameters
    ----------
    dcs
        Data container(s) from which to extract the density of states
        as well as observables.
    cluster_space
        Cluster space to use for the calculation of cluster vectors.
    temperatures
        Temperatures at which to compute the averages.
    boltzmann_constant
        Boltzmann constant :math:`k_B` in appropriate units, i.e., units
        that are consistent with the underlying cluster expansion and the
        temperature units [default: eV/K].
    fill_factor_limit
        Use data recorded up to the point when the specified fill factor
        limit was reached when computing the average cluster vectors.
        Otherwise use data for the last state.

    Raises
    ------
    ValueError
        If the data container(s) do(es) not contain entropy data
        from a Wang-Landau simulation.
    """

    # fetch potential and structures
    if isinstance(dcs, WangLandauDataContainer):
        potential, trajectory = dcs.get('potential', 'trajectory',
                                        fill_factor_limit=fill_factor_limit)
        energy_spacing = dcs.ensemble_parameters['energy_spacing']
    elif isinstance(dcs, dict) and isinstance(dcs[next(iter(dcs))], WangLandauDataContainer):
        potential, trajectory = [], []
        for dc in dcs.values():
            p, t = dc.get('potential', 'trajectory',
                          fill_factor_limit=fill_factor_limit)
            potential.extend(p)
            trajectory.extend(t)
        energy_spacing = list(dcs.values())[0].ensemble_parameters['energy_spacing']
        potential = np.array(potential)
    else:
        raise TypeError('dcs ({}) must be a data container with entropy data'
                        ' or be a dict of data containers'
                        .format(type(dcs)))

    # fetch entropy and density of states from data container(s)
    df_density, _ = get_density_of_states_wl(dcs, fill_factor_limit)

    # compute weighted density and cluster vector for each bin in energy
    # range; the weighted density is the total density divided by the number
    # of structures that fall in the respective bin
    # NOTE: the following code relies on the indices of the df_density
    # DataFrame to correspond to the energy scale. This is expected to be
    # handled in the get_density_of_states function.
    cvs = []
    weighted_density = []
    bins = list(np.array(np.round(potential / energy_spacing), dtype=int))
    for k, structure in zip(bins, trajectory):
        cvs.append(cluster_space.get_cluster_vector(structure))
        weighted_density.append(df_density.density[k] / bins.count(k))

    # compute mean and standard deviation (std) of temperature weighted
    # cluster vector
    averages = []
    enref = np.min(potential)
    for temperature in temperatures:
        boltz = np.exp(- (potential - enref) / temperature / boltzmann_constant)
        sumint = np.sum(weighted_density * boltz)
        cv_mean = np.array([np.sum(weighted_density * boltz * cv) / sumint
                            for cv in np.transpose(cvs)])
        cv_std = np.array([np.sum(weighted_density * boltz * cv ** 2) / sumint
                           for cv in np.transpose(cvs)])
        cv_std = np.sqrt(cv_std - cv_mean ** 2)
        record = {'temperature': temperature,
                  'cv_mean': cv_mean,
                  'cv_std': cv_std}
        averages.append(record)

    return DataFrame.from_dict(averages)
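
# Usage sketch (hypothetical; 'dc' and the cluster space 'cs' correspond to
# the objects used to set up the simulation, cf. the example in get()):
#
# >>> df = get_average_cluster_vectors_wl(dc, cluster_space=cs,
# ...                                     temperatures=[300, 600, 900])
# >>> print(df.cv_mean[0])  # average cluster vector at the first temperature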