# Source code for smact.data_loader

"""
Provide data from text files while transparently caching for efficiency.

This module handles the loading of external data used to initialise the
core smact.Element and smact.Species classes.  It implements a
transparent data-caching system to avoid a large amount of I/O when
naively constructing several of these objects.  It also implements a
switchable system to print verbose warning messages about possible
missing data (mainly for debugging purposes). In general these functions
are used in the background and it is not necessary to use them directly.
"""

import csv
import os

from smact import data_directory

# Module-level switch: print "verbose" warning messages
# about missing data.
_print_warnings = False


def set_warnings(enable=True):
    """Set verbose warning messages on and off.

    In order to see any of the warnings, this function needs to be
    called _before_ the first call to the smact.Element()
    constructor.

    Args:
        enable (bool) : print verbose warning messages.
    """

    # Rebind the module-level switch rather than creating a local.
    global _print_warnings
    _print_warnings = enable


def _get_data_rows(filename):
    """Generate the whitespace-separated fields of each data row in a file.

    Each line is stripped of surrounding whitespace.  Blank lines and
    lines whose first character is "#" (comments) are skipped.

    Args:
        filename (str) : path of the text data file to read.

    Yields:
        list: the fields of one data row, as strings.
    """
    with open(filename) as file:
        for line in file:
            line = line.strip()
            # Test for emptiness first: indexing line[0] on a blank line
            # would raise IndexError.
            if line and not line.startswith("#"):
                yield line.split()


def float_or_None(x):
    """Cast a value to a float or to a None.

    Args:
        x : the value to convert (normally a string field read from a
            data file).

    Returns:
        float: the converted value, or None if x cannot be interpreted
        as a number.
    """
    try:
        return float(x)
    except (TypeError, ValueError):
        # ValueError: a string that is not a number (e.g. "None" or "").
        # TypeError: an object float() does not accept at all, e.g. None.
        return None


# Loader and cache for the element oxidation-state data.
_el_ox_states = None


def lookup_element_oxidation_states(symbol, copy=True):
    """
    Retrieve a list of known oxidation states for an element.
    The oxidation states list used is the SMACT default and
    most exhaustive list.

    Args:
        symbol (str) : the atomic symbol of the element to look up.
        copy (Optional(bool)): if True (default), return a copy of the
            oxidation-state list, rather than a reference to the cached
            data -- only use copy=False in performance-sensitive code
            and where the list will not be modified!

    Returns:
        list: List of known oxidation states for the element.

            Returns None if oxidation states for the Element were not
            found in the external data.
    """

    global _el_ox_states

    if _el_ox_states is None:
        # Build the whole table before publishing it, so that a failed
        # read does not leave an empty or partial cache behind.
        _el_ox_states = {
            items[0]: [int(state) for state in items[1:]]
            for items in _get_data_rows(
                os.path.join(data_directory, "oxidation_states.txt")
            )
        }

    states = _el_ox_states.get(symbol)
    if states is None:
        if _print_warnings:
            print(
                "WARNING: Oxidation states for element {} "
                "not found.".format(symbol)
            )
        return None

    # The cached lists hold plain integers, so a shallow copy of the list
    # is enough to protect the cache from modification by the caller.
    return list(states) if copy else states


# Cache for the ICSD oxidation-state data; filled on first lookup.
_el_ox_states_icsd = None


def lookup_element_oxidation_states_icsd(symbol, copy=True):
    """
    Retrieve a list of known oxidation states for an element.
    The oxidation states list used contains only those found
    in the ICSD (and judged to be non-spurious).

    Args:
        symbol (str) : the atomic symbol of the element to look up.
        copy (Optional(bool)): if True (default), return a copy of the
            oxidation-state list, rather than a reference to the cached
            data -- only use copy=False in performance-sensitive code
            and where the list will not be modified!

    Returns:
        list: List of known oxidation states for the element.

            Return None if oxidation states for the Element were not
            found in the external data.
    """

    global _el_ox_states_icsd

    if _el_ox_states_icsd is None:
        # Build the whole table before publishing it, so that a failed
        # read does not leave an empty or partial cache behind.
        _el_ox_states_icsd = {
            items[0]: [int(state) for state in items[1:]]
            for items in _get_data_rows(
                os.path.join(data_directory, "oxidation_states_icsd.txt")
            )
        }

    states = _el_ox_states_icsd.get(symbol)
    if states is None:
        if _print_warnings:
            # Note the trailing space after "{}": without it the symbol
            # runs straight into "not found.".
            print(
                "WARNING: Oxidation states for element {} "
                "not found.".format(symbol)
            )
        return None

    # The cached lists hold plain integers, so a shallow copy of the list
    # is enough to protect the cache from modification by the caller.
    return list(states) if copy else states


# Cache for the structure-prediction oxidation-state data; filled on
# first lookup.
_el_ox_states_sp = None


def lookup_element_oxidation_states_sp(symbol, copy=True):
    """
    Retrieve a list of known oxidation states for an element.
    The oxidation states list used contains only those that
    are in the Pymatgen default lambda table for structure prediction.

    Args:
        symbol (str) : the atomic symbol of the element to look up.
        copy (Optional(bool)): if True (default), return a copy of the
            oxidation-state list, rather than a reference to the cached
            data -- only use copy=False in performance-sensitive code
            and where the list will not be modified!

    Returns:
        list: List of known oxidation states for the element.

            Return None if oxidation states for the Element were not
            found in the external data.
    """

    global _el_ox_states_sp

    if _el_ox_states_sp is None:
        # Build the whole table before publishing it, so that a failed
        # read does not leave an empty or partial cache behind.
        _el_ox_states_sp = {
            items[0]: [int(state) for state in items[1:]]
            for items in _get_data_rows(
                os.path.join(data_directory, "oxidation_states_SP.txt")
            )
        }

    states = _el_ox_states_sp.get(symbol)
    if states is None:
        if _print_warnings:
            print(
                "WARNING: Oxidation states for element {} "
                "not found.".format(symbol)
            )
        return None

    # The cached lists hold plain integers, so a shallow copy of the list
    # is enough to protect the cache from modification by the caller.
    return list(states) if copy else states


# Cache for the Wikipedia oxidation-state data; filled on first lookup.
_el_ox_states_wiki = None


def lookup_element_oxidation_states_wiki(symbol, copy=True):
    """
    Retrieve a list of known oxidation states for an element.
    The oxidation states list used contains only those that
    are on Wikipedia (https://en.wikipedia.org/wiki/Template:List_of_oxidation_states_of_the_elements).

    Args:
        symbol (str) : the atomic symbol of the element to look up.
        copy (Optional(bool)): if True (default), return a copy of the
            oxidation-state list, rather than a reference to the cached
            data -- only use copy=False in performance-sensitive code
            and where the list will not be modified!

    Returns:
        list: List of known oxidation states for the element.

            Return None if oxidation states for the Element were not
            found in the external data.
    """

    global _el_ox_states_wiki

    if _el_ox_states_wiki is None:
        # Build the whole table before publishing it, so that a failed
        # read does not leave an empty or partial cache behind.
        _el_ox_states_wiki = {
            items[0]: [int(state) for state in items[1:]]
            for items in _get_data_rows(
                os.path.join(data_directory, "oxidation_states_wiki.txt")
            )
        }

    states = _el_ox_states_wiki.get(symbol)
    if states is None:
        if _print_warnings:
            print(
                "WARNING: Oxidation states for element {} "
                "not found.".format(symbol)
            )
        return None

    # The cached lists hold plain integers, so a shallow copy of the list
    # is enough to protect the cache from modification by the caller.
    return list(states) if copy else states


# Cache for user-supplied oxidation-state files.  Once populated it maps
# each file path to that file's {symbol: [oxidation states]} table, so
# different files never shadow one another.
_el_ox_states_custom = None


def lookup_element_oxidation_states_custom(symbol, filepath, copy=True):
    """
    Retrieve a list of known oxidation states for an element.
    The oxidation states list is specified by the user in a text file.

    Each file is read once and cached under its path; a lookup against a
    different path reads and caches that file separately.

    Args:
        symbol (str) : the atomic symbol of the element to look up.
        filepath (str) : path of the text file holding the oxidation
            states.
        copy (Optional(bool)): if True (default), return a copy of the
            oxidation-state list, rather than a reference to the cached
            data -- only use copy=False in performance-sensitive code
            and where the list will not be modified!

    Returns:
        list: List of known oxidation states for the element.

            Return None if oxidation states for the Element were not
            found in the external data.
    """

    global _el_ox_states_custom

    if _el_ox_states_custom is None:
        _el_ox_states_custom = {}

    if filepath not in _el_ox_states_custom:
        # Build the whole table before caching it, so that a failed read
        # does not leave an empty or partial entry behind.
        table = {
            items[0]: [int(state) for state in items[1:]]
            for items in _get_data_rows(filepath)
        }
        _el_ox_states_custom[filepath] = table

    states = _el_ox_states_custom[filepath].get(symbol)
    if states is None:
        if _print_warnings:
            print(
                "WARNING: Oxidation states for element {} "
                "not found.".format(symbol)
            )
        return None

    # The cached lists hold plain integers, so a shallow copy of the list
    # is enough to protect the cache from modification by the caller.
    return list(states) if copy else states


# Loader and cache for the element HHI scores.

_element_hhis = None


def lookup_element_hhis(symbol):
    """
    Retrieve the HHI_R and HHI_p scores for an element.

    Args:
        symbol : the atomic symbol of the element to look up.

    Returns:
        tuple : (HHI_p, HHI_R)

            Return None if values for the elements were
            not found in the external data.
    """

    global _element_hhis

    if _element_hhis is None:
        # Build the whole table before publishing it, so that a failed
        # read does not leave an empty or partial cache behind.
        scores = {}

        with open(os.path.join(data_directory, "HHIs.txt")) as file:
            for line in file:
                line = line.strip()

                # Skip blank lines (line[0] would raise IndexError on
                # them) and "#" comment lines.
                if not line or line.startswith("#"):
                    continue

                items = line.split()
                scores[items[0]] = (float(items[1]), float(items[2]))

        _element_hhis = scores

    hhis = _element_hhis.get(symbol)
    if hhis is None and _print_warnings:
        print("WARNING: HHI data for element {} not found.".format(symbol))

    return hhis


# Loader and cache for elemental data

_element_data = None


def lookup_element_data(symbol, copy=True):
    """
    Retrieve tabulated data for an element.

    The table "data/element_data.txt" contains a collection of relevant
    atomic data. If a cache exists in the form of the module-level
    variable _element_data, this is returned. Otherwise, a dictionary is
    constructed from the data table and cached before returning it.

    Args:
        symbol (str) : Atomic symbol for lookup

        copy (Optional(bool)) : if True (default), return a copy of the
            data dictionary, rather than a reference to the cached
            object -- only use copy=False in performance-sensitive code
            and where you are certain the dictionary will not be
            modified!

    Returns:
        dict: Dictionary of data for given element, keyed by
        column headings from data/element_data.txt.

        Returns None if the element was not found in the data table.
    """
    global _element_data

    if _element_data is None:
        keys = (
            "Symbol",
            "Name",
            "Z",
            "Mass",
            "r_cov",
            "e_affinity",
            "p_eig",
            "s_eig",
            "Abundance",
            "el_neg",
            "ion_pot",
            "dipol",
        )

        # Build the whole table before publishing it, so that a failed
        # read does not leave an empty or partial cache behind.
        table = {}
        for items in _get_data_rows(
            os.path.join(data_directory, "element_data.txt")
        ):
            # First two columns are strings and should be left intact
            # Everything else is numerical and should be cast to a float
            # or, if not clearly a number, to None
            clean_items = items[0:2] + [float_or_None(x) for x in items[2:]]

            table[items[0]] = dict(zip(keys, clean_items))

        _element_data = table

    data = _element_data.get(symbol)
    if data is None:
        if _print_warnings:
            print("WARNING: Elemental data for {} not found.".format(symbol))
        return None

    # _element_data stores dictionaries whose values are strings, floats
    # or None, so dict.copy() is sufficient to protect the cache from
    # modification by the caller.
    return data.copy() if copy else data


# Loader and cache for the element Shannon radii datasets.

_element_shannon_radii_data = None


def lookup_element_shannon_radius_data(symbol, copy=True):
    """
    Retrieve Shannon radii for known states of an element.

    Retrieve Shannon radii for known oxidation states and coordination
    environments of an element.

    Args:
        symbol (str) : the atomic symbol of the element to look up.

        copy (Optional(bool)): if True (default), return a copy of the data
            dictionary, rather than a reference to the cached object --
            only use copy=False in performance-sensitive code and where
            you are certain the dictionary will not be modified!

    Returns:
        list:
            Shannon radii datasets.

        Returns None if the element was not found among the external
        data.

        Shannon radii datasets are dictionaries with the keys:

        charge
            *int* charge
        coordination
            *str* coordination, exactly as written in the CSV file
        crystal_radius
            *float*
        ionic_radius
            *float*
        comment
            *str*
    """

    global _element_shannon_radii_data

    if _element_shannon_radii_data is None:
        # Build the whole table before publishing it, so that a failed
        # read does not leave an empty or partial cache behind.
        radii = {}

        # newline="" lets the csv module do its own newline handling.
        with open(
            os.path.join(data_directory, "shannon_radii.csv"), newline=""
        ) as file:
            reader = csv.reader(file)

            # Skip the first row (headers); the default avoids a
            # StopIteration on an empty file.
            next(reader, None)

            for row in reader:
                # For the shannon radii, there are multiple datasets for
                # different element/oxidation-state/coordination
                # combinations, so each symbol maps to a list.
                radii.setdefault(row[0], []).append(
                    {
                        "charge": int(row[1]),
                        "coordination": row[2],
                        "crystal_radius": float(row[3]),
                        "ionic_radius": float(row[4]),
                        "comment": row[5],
                    }
                )

        _element_shannon_radii_data = radii

    datasets = _element_shannon_radii_data.get(symbol)
    if datasets is None:
        if _print_warnings:
            print(
                "WARNING: Shannon-radius data for element {} not "
                "found.".format(symbol)
            )
        return None

    if copy:
        # The cache stores a list of dictionaries whose values are ints,
        # floats and strings, so copying the list and each dictionary is
        # enough to protect the cache.
        return [item.copy() for item in datasets]
    return datasets


# Loader and cache for the machine-learned extended element Shannon radii datasets.

_element_shannon_radii_data_extendedML = None


def lookup_element_shannon_radius_data_extendedML(symbol, copy=True):
    """
    Retrieve the machine learned extended Shannon radii for
    known states of an element.

    Retrieve Shannon radii for known oxidation states and coordination
    environments of an element.

    Source of extended radii is:
    Baloch, A.A., Alqahtani, S.M., Mumtaz, F., Muqaibel, A.H., Rashkeev,
    S.N. and Alharbi, F.H., 2021.
    Extending Shannon's Ionic Radii Database Using Machine Learning.
    arXiv preprint arXiv:2101.00269.

    Args:
        symbol (str) : the atomic symbol of the element to look up.

        copy (Optional(bool)): if True (default), return a copy of the data
            dictionary, rather than a reference to the cached object --
            only use copy=False in performance-sensitive code and where
            you are certain the dictionary will not be modified!

    Returns:
        list:
            Extended Shannon radii datasets.

        Returns None if the element was not found among the external
        data.

        Shannon radii datasets are dictionaries with the keys:

        charge
            *int* charge
        coordination
            *str* coordination, exactly as written in the CSV file
        crystal_radius
            *float*
        ionic_radius
            *float*
        comment
            *str*
    """

    global _element_shannon_radii_data_extendedML

    if _element_shannon_radii_data_extendedML is None:
        # Build the whole table before publishing it, so that a failed
        # read does not leave an empty or partial cache behind.
        radii = {}

        # newline="" lets the csv module do its own newline handling.
        with open(
            os.path.join(data_directory, "shannon_radii_ML_extended.csv"),
            newline="",
        ) as file:
            reader = csv.reader(file)

            # Skip the first row (headers); the default avoids a
            # StopIteration on an empty file.
            next(reader, None)

            for row in reader:
                # For the shannon radii, there are multiple datasets for
                # different element/oxidation-state/coordination
                # combinations, so each symbol maps to a list.
                radii.setdefault(row[0], []).append(
                    {
                        "charge": int(row[1]),
                        "coordination": row[2],
                        "crystal_radius": float(row[3]),
                        "ionic_radius": float(row[4]),
                        "comment": row[5],
                    }
                )

        _element_shannon_radii_data_extendedML = radii

    datasets = _element_shannon_radii_data_extendedML.get(symbol)
    if datasets is None:
        if _print_warnings:
            print(
                "WARNING: Extended Shannon-radius data for element {} not "
                "found.".format(symbol)
            )
        return None

    if copy:
        # The cache stores a list of dictionaries whose values are ints,
        # floats and strings, so copying the list and each dictionary is
        # enough to protect the cache.
        return [item.copy() for item in datasets]
    return datasets


# Loader and cache for the element solid-state energy (SSE) datasets.

_element_ssedata = None


def lookup_element_sse_data(symbol):
    """
    Retrieve the solid-state energy (SSE) data for an element.

    Taken from J. Am. Chem. Soc., 2011, 133 (42), pp 16852-16960,
    DOI: 10.1021/ja204670s

    Args:
        symbol : the atomic symbol of the element to look up.

    Returns:
        dict : SSE dataset for the element, or None
            if the element was not found among the external data.
            The dictionary returned is the cached object itself, not
            a copy.

        SSE datasets are dictionaries with the keys:

        AtomicNumber
            *int*
        SolidStateEnergy
            *float* SSE
        IonisationPotential
            *float*
        ElectronAffinity
            *float*
        MullikenElectronegativity
            *float*
        SolidStateRenormalisationEnergy
            *float*
    """

    global _element_ssedata

    if _element_ssedata is None:
        # Build the whole table before publishing it, so that a failed
        # read does not leave an empty or partial cache behind.
        table = {}

        # newline="" lets the csv module do its own newline handling.
        with open(os.path.join(data_directory, "SSE.csv"), newline="") as file:
            for row in csv.reader(file):
                table[row[0]] = {
                    "AtomicNumber": int(row[1]),
                    "SolidStateEnergy": float(row[2]),
                    "IonisationPotential": float(row[3]),
                    "ElectronAffinity": float(row[4]),
                    "MullikenElectronegativity": float(row[5]),
                    "SolidStateRenormalisationEnergy": float(row[6]),
                }

        _element_ssedata = table

    dataset = _element_ssedata.get(symbol)
    if dataset is None and _print_warnings:
        print(
            "WARNING: Solid-state energy data for element {} not"
            " found.".format(symbol)
        )

    return dataset


# Loader and cache for the revised (2015) element solid-state energy
# (SSE) datasets.

_element_sse2015_data = None


def lookup_element_sse2015_data(symbol, copy=True):
    """
    Retrieve SSE (2015) data for element in oxidation state.

    Retrieve the solid-state energy (SSE2015) data for an element in an
    oxidation state.  Taken from J. Solid State Chem., 2015, 231,
    pp138-144, DOI: 10.1016/j.jssc.2015.07.037.

    Args:
        symbol : the atomic symbol of the element to look up.
        copy: if True (default), return a copy of the data dictionary,
            rather than a reference to a cached object -- only use
            copy=False in performance-sensitive code and where you are
            certain the dictionary will not be modified!

    Returns:
        list : SSE datasets for the element, or None
            if the element was not found among the external data.

        SSE datasets are dictionaries with the keys:

        OxidationState
            *int*
        SolidStateEnergy2015
            *float* SSE2015
    """

    global _element_sse2015_data

    if _element_sse2015_data is None:
        # Build the whole table before publishing it, so that a failed
        # read does not leave an empty or partial cache behind.
        table = {}

        # newline="" lets the csv module do its own newline handling.
        with open(
            os.path.join(data_directory, "SSE_2015.csv"), newline=""
        ) as file:
            for row in csv.reader(file):
                # Elements can have multiple SSE values depending on
                # their oxidation state, so each symbol maps to a list.
                table.setdefault(row[0], []).append(
                    {
                        "OxidationState": int(row[1]),
                        "SolidStateEnergy2015": float(row[2]),
                    }
                )

        _element_sse2015_data = table

    datasets = _element_sse2015_data.get(symbol)
    if datasets is None:
        if _print_warnings:
            print(
                "WARNING: Solid-state energy (revised 2015) data for "
                "element {} not found.".format(symbol)
            )
        return None

    # Copy both the list and each dictionary so callers cannot modify
    # the cache; the dictionary values are plain ints and floats.
    return [item.copy() for item in datasets] if copy else datasets


# Loader and cache for the element solid-state energy (SSE) from Pauling
# electronegativity datasets.

_element_ssepauling_data = None


def lookup_element_sse_pauling_data(symbol):
    """Retrieve Pauling SSE data

    Retrieve the solid-state energy (SSEPauling) data for an element
    from the regression fit when SSE2015 is plotted against Pauling
    electronegativity.  Taken from J. Solid State Chem., 2015, 231,
    pp138-144, DOI: 10.1016/j.jssc.2015.07.037

    Args:
        symbol (str) : the atomic symbol of the element to look up.

    Returns:
        dict: A dictionary with the single key "SolidStateEnergyPauling"
        (*float*) for the element, or None if the element was not found
        among the external data.  The dictionary returned is the cached
        object itself, not a copy.
    """

    global _element_ssepauling_data

    if _element_ssepauling_data is None:
        # Build the whole table before publishing it, so that a failed
        # read does not leave an empty or partial cache behind.
        table = {}

        # newline="" lets the csv module do its own newline handling.
        with open(
            os.path.join(data_directory, "SSE_Pauling.csv"), newline=""
        ) as file:
            for row in csv.reader(file):
                table[row[0]] = {"SolidStateEnergyPauling": float(row[1])}

        _element_ssepauling_data = table

    dataset = _element_ssepauling_data.get(symbol)
    if dataset is None and _print_warnings:
        # Each fragment ends with exactly one space so the message does
        # not contain doubled spaces.
        print(
            "WARNING: Solid-state energy data from Pauling "
            "electronegativity regression fit for "
            "element {} not found.".format(symbol)
        )

    return dataset