Source code for smact.utils.oxidation

"""Utility functions for handling oxidation states."""

from __future__ import annotations

from pathlib import Path
from typing import cast

import pandas as pd

from smact import Element, data_directory, ordered_elements
from smact.utils.species import unparse_spec



[docs]
class ICSD24OxStatesFilter:
    """Class to handle filtering the ICSD 24 oxidation states list.

    The ICSD 24 oxidation states list is a list of oxidation states for each element in the ICSD 24 database.

    Attributes:
        ox_states_df (pd.DataFrame): The ICSD 24 oxidation states list as a DataFrame.
    """

    def __init__(self) -> None:
        """Load the ICSD 24 oxidation state counts into ``ox_states_df``."""
        # The counts ship as a JSON file inside the package data directory.
        counts_file = Path(data_directory) / "oxidation_states_icsd24_counts.json"
        self.ox_states_df = pd.read_json(str(counts_file))


[docs]
    def filter(
        self,
        consensus: int = 3,
        include_zero: bool = False,
        commonality: str | float = "low",
    ) -> pd.DataFrame:
        """Filter the ICSD 24 oxidation states list by a threshold.

        Args:
            consensus (int): Minimum number of occurrences in
                literature for an ion to be considered valid.
                Default is 3.
            include_zero (bool): Include oxidation state of zero in the
                filtered list. Default is False.
            commonality (str): Excludes species below a certain
                proportion of appearances in literature with respect
                to the total number of reports of a given element
                (after the consensus threshold has been applied).
                "low" includes all species, "medium" excludes rare
                species below 10% occurrence, and "high" excludes
                non-majority species below 50% occurrence. "main"
                selects the species with the highest occurrence for
                a given element. Users may also specify their own
                threshold (float or int). Default is "low".

        Returns:
            pd.DataFrame: The filtered oxidation states list as a DataFrame.
        """
        commonality_map = {"low": 0, "medium": 10, "high": 50}
        commonality_threshold = 0

        if isinstance(commonality, str):
            if commonality == "main":
                pass
            else:
                threshold = commonality_map.get(commonality)
                if threshold is None:
                    msg = f"Unrecognised commonality string {commonality!r}. Use 'low', 'medium', 'high', or 'main'."
                    raise ValueError(msg)
                commonality_threshold = threshold
        elif isinstance(commonality, int | float):
            commonality_threshold = commonality
        else:
            msg = "commonality must be a string ('low', 'medium', 'high', 'main'), a float or an integer"
            raise TypeError(msg)

        if not include_zero:
            filtered_df = self.ox_states_df[self.ox_states_df["oxidation_state"] != 0].reset_index(drop=True)
        else:
            filtered_df = self.ox_states_df

        filtered_df = cast(
            "pd.DataFrame",
            filtered_df[(filtered_df["results_count"] >= consensus) & (filtered_df["results_count"] != 0)],
        ).reset_index(drop=True)

        element_totals = filtered_df.groupby("element")["results_count"].transform("sum")
        filtered_df["species_proportion (%)"] = filtered_df["results_count"] / element_totals * 100

        if commonality == "main":
            idx = filtered_df.groupby("element")["species_proportion (%)"].idxmax()
            filtered_df = cast("pd.DataFrame", filtered_df.loc[idx]).reset_index(drop=True)
        else:
            filtered_df = cast(
                "pd.DataFrame",
                filtered_df[filtered_df["species_proportion (%)"] >= commonality_threshold],
            ).reset_index(drop=True)

        summary_df = (
            filtered_df.groupby("element").apply(self._filter_oxidation_states, 0, include_groups=False).reset_index()
        )
        summary_df.columns = ["element", "oxidation_state"]
        summary_df["atomic_number"] = summary_df["element"].apply(lambda x: Element(x).number)
        return summary_df.sort_values("atomic_number").drop(columns="atomic_number").reset_index(drop=True)



[docs]
    def get_species_list(
        self,
        consensus: int = 3,
        include_zero: bool = False,
        include_one_oxidation_state: bool = False,
        commonality: str | float = "low",
    ) -> list[str]:
        """Return the filtered ICSD 24 oxidation states as a flat list of species strings.

        The arguments ``consensus``, ``include_zero`` and ``commonality`` are
        forwarded unchanged to :meth:`filter`.

        Args:
            consensus (int): Minimum number of occurrences in
                literature for an ion to be considered valid.
                Default is 3.
            include_zero (bool): Include oxidation state of zero in
                the filtered list. Default is False.
            include_one_oxidation_state (bool): Include oxidation
                states +1 and -1 in the filtered list or include
                as + and - signs. Default is False.
            commonality (str): Excludes species below a certain
                proportion of appearances in literature with respect
                to the total number of reports of a given element
                (after the consensus threshold has been applied).
                "low" includes all species, "medium" excludes rare
                species below 10% occurrence, and "high" excludes
                non-majority species below 50% occurrence. "main"
                selects the species with the highest occurrence for
                a given element. Users may also specify their own
                threshold (float or int). Default is "low".

        Returns:
            list: The filtered oxidation states list as a list of species.
        """
        summary = self.filter(consensus, include_zero, commonality)
        collected = []
        # Each summary row holds one element and its space-separated oxidation states.
        for symbol, joined_states in zip(summary["element"], summary["oxidation_state"]):
            element = str(symbol)
            for token in str(joined_states).split(" "):
                try:
                    species = unparse_spec(
                        (element, int(float(token))),
                        include_one=include_one_oxidation_state,
                    )
                except ValueError:
                    # Tokens that cannot be turned into a species (e.g. an
                    # empty string) are skipped rather than aborting the list.
                    continue
                collected.append(species)
        return collected



[docs]
    def get_species_occurrences_df(
        self,
        consensus: int = 3,
        include_one_oxidation_state: bool = False,
        sort_by_occurrences: bool = True,
        include_zero: bool = False,
    ) -> pd.DataFrame:
        """Get the ICSD 24 oxidation states list as a dataframe of species with their occurrences.

        Args:
            consensus (int): Minimum number of occurrences in
                literature for an ion to be considered valid.
                Default is 3.
            include_one_oxidation_state (bool): Include oxidation
                states +1 and -1 in the species or include as + and
                - signs. Default is False.
            sort_by_occurrences (bool): Sort the species list by
                occurrences. Default is True.
            include_zero (bool): Include oxidation state of zero in
                the filtered list. Default is False.

        Returns:
            dataframe: The species list as a dataframe of species with their occurrences.
        """
        if not include_zero:
            species_occurrences_df = self.ox_states_df[self.ox_states_df["oxidation_state"] != 0].reset_index(drop=True)
        else:
            species_occurrences_df = self.ox_states_df

        species_occurrences_df = cast(
            "pd.DataFrame",
            species_occurrences_df[species_occurrences_df.results_count >= consensus],
        ).reset_index(drop=True)
        species_occurrences_df["species"] = species_occurrences_df.apply(
            lambda x: unparse_spec(
                (x["element"], x["oxidation_state"]),
                include_one=include_one_oxidation_state,
            ),
            axis=1,
        )
        species_occurrences_df = species_occurrences_df[["element", "species", "results_count"]]
        grouped = species_occurrences_df.groupby("element")["results_count"]
        element_totals = grouped.transform("sum")
        species_occurrences_df["species_proportion (%)"] = (
            species_occurrences_df["results_count"] / element_totals * 100
        )
        if sort_by_occurrences:
            sorted_df = species_occurrences_df.sort_values(  # type: ignore[call-overload]  # pandas stubs overload
                by="results_count", ascending=False
            )
            return cast("pd.DataFrame", sorted_df).reset_index(drop=True)
        return cast("pd.DataFrame", species_occurrences_df)



[docs]
    def write(
        self,
        filename: str,
        comment: str | None = None,
        consensus: int = 3,
        include_zero: bool = False,
        commonality: str | float = "low",
    ) -> None:
        """Write the filtered ICSD 24 oxidation states list to a SMACT-compatible oxidation states txt file.

        The file starts with a ``#``-prefixed header, followed by one line
        per element returned by ``ordered_elements(1, 103)``: the element
        symbol and, if any survive the filter, its space-separated oxidation
        states.

        Args:
            filename (str): The filename to write the filtered oxidation states list to.
                ".txt" is appended if the name does not already end with it.
            comment (str): A comment to include in the txt file. Every line of a
                multi-line comment is written as its own ``#`` header line.
                Default is None.
            consensus (int): Minimum number of occurrences in
                literature for an ion to be considered valid.
                Default is 3.
            include_zero (bool): Include oxidation state of zero in
                the filtered list. Default is False.
            commonality (str | float): Excludes species below a certain
                proportion of appearances in literature with respect
                to the total number of reports of a given element
                (after the consensus threshold has been applied).
                "low" includes all species, "medium" excludes rare
                species below 10% occurrence, and "high" excludes
                non-majority species below 50% occurrence. "main"
                selects the species with the highest occurrence for
                a given element. Users may also specify their own
                threshold (float or int). Default is "low".
        """
        # Filter first so that invalid arguments raise before any file is created.
        filtered_df = self.filter(consensus, include_zero, commonality)
        # Convert the DataFrame to the required format: element -> "state state ..."
        summary_dict = filtered_df.set_index("element")["oxidation_state"].to_dict()

        # Every element gets a line; elements without surviving states are
        # written as the bare symbol.
        body_lines = []
        for element in ordered_elements(1, 103):
            oxidation_states = summary_dict.get(element, "")
            if oxidation_states:
                body_lines.append(f"{element} {oxidation_states}".strip())
            else:
                body_lines.append(element)

        header_lines = [
            "#",
            "# Oxidation state set",
            f"# Source: ICSD (2024), filtered for {commonality} commonality of reports",
            "#",
        ]
        if comment:
            # Prefix each line separately so an embedded newline cannot leave
            # an un-commented line in the header.
            header_lines.extend(f"# {comment_line}" for comment_line in comment.splitlines())
            header_lines.append("#")
        if include_zero:
            header_lines.extend(["# Includes oxidation state 0", "#"])

        # Write the filtered oxidation states list to a txt file
        if not filename.endswith(".txt"):
            filename += ".txt"
        # Explicit encoding keeps the output independent of the platform locale.
        with Path(filename).open("w", encoding="utf-8") as f:
            f.writelines(f"{line}\n" for line in [*header_lines, *body_lines])


    def _filter_oxidation_states(self, group: pd.DataFrame, threshold: int) -> str:
        """Filter the oxidation states list by a threshold."""
        filtered_states = group[group["results_count"] >= threshold]

        return " ".join(map(str, sorted(filtered_states["oxidation_state"])))