"""Utility functions for handling oxidation states."""
from __future__ import annotations
from pathlib import Path
from typing import cast
import pandas as pd
from smact import Element, data_directory, ordered_elements
from smact.utils.species import unparse_spec
[docs]
class ICSD24OxStatesFilter:
"""Class to handle filtering the ICSD 24 oxidation states list.
The ICSD 24 oxidation states list is a list of oxidation states for each element in the ICSD 24 database.
Attributes:
ox_states_df (pd.DataFrame): The ICSD 24 oxidation states list as a DataFrame.
"""
def __init__(self) -> None:
"""Initialise the ICSD 24 oxidation states list."""
self.ox_states_df = pd.read_json(str(Path(data_directory) / "oxidation_states_icsd24_counts.json"))
[docs]
def filter(
self,
consensus: int = 3,
include_zero: bool = False,
commonality: str | float = "low",
) -> pd.DataFrame:
"""Filter the ICSD 24 oxidation states list by a threshold.
Args:
consensus (int): Minimum number of occurrences in
literature for an ion to be considered valid.
Default is 3.
include_zero (bool): Include oxidation state of zero in the
filtered list. Default is False.
commonality (str): Excludes species below a certain
proportion of appearances in literature with respect
to the total number of reports of a given element
(after the consensus threshold has been applied).
"low" includes all species, "medium" excludes rare
species below 10% occurrence, and "high" excludes
non-majority species below 50% occurrence. "main"
selects the species with the highest occurrence for
a given element. Users may also specify their own
threshold (float or int). Default is "low".
Returns:
pd.DataFrame: The filtered oxidation states list as a DataFrame.
"""
commonality_map = {"low": 0, "medium": 10, "high": 50}
commonality_threshold = 0
if isinstance(commonality, str):
if commonality == "main":
pass
else:
threshold = commonality_map.get(commonality)
if threshold is None:
msg = f"Unrecognised commonality string {commonality!r}. Use 'low', 'medium', 'high', or 'main'."
raise ValueError(msg)
commonality_threshold = threshold
elif isinstance(commonality, int | float):
commonality_threshold = commonality
else:
msg = "commonality must be a string ('low', 'medium', 'high', 'main'), a float or an integer"
raise TypeError(msg)
if not include_zero:
filtered_df = self.ox_states_df[self.ox_states_df["oxidation_state"] != 0].reset_index(drop=True)
else:
filtered_df = self.ox_states_df
filtered_df = cast(
"pd.DataFrame",
filtered_df[(filtered_df["results_count"] >= consensus) & (filtered_df["results_count"] != 0)],
).reset_index(drop=True)
element_totals = filtered_df.groupby("element")["results_count"].transform("sum")
filtered_df["species_proportion (%)"] = filtered_df["results_count"] / element_totals * 100
if commonality == "main":
idx = filtered_df.groupby("element")["species_proportion (%)"].idxmax()
filtered_df = cast("pd.DataFrame", filtered_df.loc[idx]).reset_index(drop=True)
else:
filtered_df = cast(
"pd.DataFrame",
filtered_df[filtered_df["species_proportion (%)"] >= commonality_threshold],
).reset_index(drop=True)
summary_df = (
filtered_df.groupby("element").apply(self._filter_oxidation_states, 0, include_groups=False).reset_index()
)
summary_df.columns = ["element", "oxidation_state"]
summary_df["atomic_number"] = summary_df["element"].apply(lambda x: Element(x).number)
return summary_df.sort_values("atomic_number").drop(columns="atomic_number").reset_index(drop=True)
[docs]
def get_species_list(
self,
consensus: int = 3,
include_zero: bool = False,
include_one_oxidation_state: bool = False,
commonality: str | float = "low",
) -> list[str]:
"""Get the filtered ICSD 24 oxidation states list as a list of species.
Args:
consensus (int): Minimum number of occurrences in
literature for an ion to be considered valid.
Default is 3.
include_zero (bool): Include oxidation state of zero in
the filtered list. Default is False.
include_one_oxidation_state (bool): Include oxidation
states +1 and -1 in the filtered list or include
as + and - signs. Default is False.
commonality (str): Excludes species below a certain
proportion of appearances in literature with respect
to the total number of reports of a given element
(after the consensus threshold has been applied).
"low" includes all species, "medium" excludes rare
species below 10% occurrence, and "high" excludes
non-majority species below 50% occurrence. "main"
selects the species with the highest occurrence for
a given element. Users may also specify their own
threshold (float or int). Default is "low".
Returns:
list: The filtered oxidation states list as a list of species.
"""
filtered_df = self.filter(consensus, include_zero, commonality)
species_list = []
for _, row in filtered_df.iterrows():
ox_states = str(row["oxidation_state"]).split(" ")
for ox_state in ox_states:
try:
species_list.append(
unparse_spec(
(str(row["element"]), int(float(ox_state))),
include_one=include_one_oxidation_state,
)
)
except ValueError:
continue
return species_list
[docs]
def get_species_occurrences_df(
self,
consensus: int = 3,
include_one_oxidation_state: bool = False,
sort_by_occurrences: bool = True,
include_zero: bool = False,
) -> pd.DataFrame:
"""Get the ICSD 24 oxidation states list as a dataframe of species with their occurrences.
Args:
consensus (int): Minimum number of occurrences in
literature for an ion to be considered valid.
Default is 3.
include_one_oxidation_state (bool): Include oxidation
states +1 and -1 in the species or include as + and
- signs. Default is False.
sort_by_occurrences (bool): Sort the species list by
occurrences. Default is True.
include_zero (bool): Include oxidation state of zero in
the filtered list. Default is False.
Returns:
dataframe: The species list as a dataframe of species with their occurrences.
"""
if not include_zero:
species_occurrences_df = self.ox_states_df[self.ox_states_df["oxidation_state"] != 0].reset_index(drop=True)
else:
species_occurrences_df = self.ox_states_df
species_occurrences_df = cast(
"pd.DataFrame",
species_occurrences_df[species_occurrences_df.results_count >= consensus],
).reset_index(drop=True)
species_occurrences_df["species"] = species_occurrences_df.apply(
lambda x: unparse_spec(
(x["element"], x["oxidation_state"]),
include_one=include_one_oxidation_state,
),
axis=1,
)
species_occurrences_df = species_occurrences_df[["element", "species", "results_count"]]
grouped = species_occurrences_df.groupby("element")["results_count"]
element_totals = grouped.transform("sum")
species_occurrences_df["species_proportion (%)"] = (
species_occurrences_df["results_count"] / element_totals * 100
)
if sort_by_occurrences:
sorted_df = species_occurrences_df.sort_values( # type: ignore[call-overload] # pandas stubs overload
by="results_count", ascending=False
)
return cast("pd.DataFrame", sorted_df).reset_index(drop=True)
return cast("pd.DataFrame", species_occurrences_df)
[docs]
def write(
self,
filename: str,
comment: str | None = None,
consensus: int = 3,
include_zero: bool = False,
commonality: str = "low",
) -> None:
"""Write the filtered ICSD 24 oxidation states list to a SMACT-compatible oxidation states txt file.
Args:
filename (str): The filename to write the filtered oxidation states list to.
comment (str): A comment to include in the txt file. Default is None.
consensus (int): Minimum number of occurrences in
literature for an ion to be considered valid.
Default is 3.
include_zero (bool): Include oxidation state of zero in
the filtered list. Default is False.
commonality (str): Excludes species below a certain
proportion of appearances in literature with respect
to the total number of reports of a given element
(after the consensus threshold has been applied).
"low" includes all species, "medium" excludes rare
species below 10% occurrence, and "high" excludes
non-majority species below 50% occurrence. "main"
selects the species with the highest occurrence for
a given element. Users may also specify their own
threshold (float or int). Default is "low".
"""
filtered_df = self.filter(consensus, include_zero, commonality)
# Convert the DataFrame to the require format
summary_dict = filtered_df.set_index("element")["oxidation_state"].to_dict()
all_elements = ordered_elements(1, 103)
final_summary = []
for element in all_elements:
oxidation_states = summary_dict.get(element, "")
if oxidation_states:
final_summary.append(f"{element} {oxidation_states}".strip())
else:
final_summary.append(element)
# Write the filtered oxidation states list to a txt file
if not filename.endswith(".txt"):
filename += ".txt"
with Path(filename).open("w") as f:
f.write(
f"#\n# Oxidation state set\n"
f"# Source: ICSD (2024), filtered for"
f" {commonality} commonality of reports\n#\n"
)
if comment:
f.write(f"# {comment}\n#\n")
if include_zero:
f.write("# Includes oxidation state 0\n#\n")
for line in final_summary:
f.write(line + "\n")
def _filter_oxidation_states(self, group: pd.DataFrame, threshold: int) -> str:
"""Filter the oxidation states list by a threshold."""
filtered_states = group[group["results_count"] >= threshold]
return " ".join(map(str, sorted(filtered_states["oxidation_state"])))