Visualising crystal space#
(prerequisite: crystal_space.ipynb)
In this tutorial, we will use dimension reduction techniques to visualise a large crystal space. We will use the following techniques:
Principal Component Analysis (PCA)
t-distributed Stochastic Neighbor Embedding (t-SNE)
Uniform Manifold Approximation and Projection (UMAP)
We will build compositional embeddings from the following element embeddings, using the ElementEmbeddings package:
Magpie
Mat2Vec
Megnet16
Skipatom
Oliynyk
random_200
1. Element embeddings#
To begin, we will build compositional embeddings from element embeddings using the ElementEmbeddings package.
# Install the required packages
try:
import google.colab
IN_COLAB = True
except:
IN_COLAB = False
if IN_COLAB:
!uv pip install smact[crystal_space] --quiet
!uv pip install "plotly>=6.1.1" kaleido
!plotly_get_chrome -y
from collections.abc import Iterable
from pathlib import Path
from tqdm import tqdm
import numpy as np
import pandas as pd
from elementembeddings.composition import CompositionalEmbedding
# Names of the element-embedding schemes used throughout the notebook.
embedding_names = ["magpie", "mat2vec", "megnet16", "skipatom", "oliynyk", "random_200"]

# Names of the dimension-reduction methods applied to each embedding.
reducers = ["pca", "tsne", "umap"]

# set save directory (created on demand, including missing parents)
save_dir = Path("data/binary/")
save_dir.mkdir(parents=True, exist_ok=True)

# Category table; it must provide a "label" column (read below).
df_category = pd.read_pickle(save_dir / "df_binary_category.pkl")

# sampling
n_samples = 100  # 3000 for "Mapping inorganic crystal chemical space" paper

# Integer code for every category label (not read anywhere in the visible code).
dict_label = dict(standard=0, missing=1, interesting=2, unlikely=3)

# Category labels in the order they are sampled (dict insertion order).
labels = list(dict_label)

# Draw at most `n_samples` rows per label, with a fixed seed for reproducibility.
list_df_sample = []
for label in labels:
    rows_with_label = df_category.loc[df_category["label"] == label]
    n_draw = min(n_samples, len(rows_with_label))
    list_df_sample.append(rows_with_label.sample(n=n_draw, random_state=42))

df_sample = pd.concat(list_df_sample)

# save sampled data
df_sample.to_pickle(save_dir / "df_binary_sample.pkl")
def get_embedding(formula, embedding="magpie", stats="mean"):
    """
    Computes a compositional embedding for a given chemical formula or a list of chemical formulas.

    Parameters:
    -----------
    formula : str or iterable
        A single chemical formula, or an iterable of chemical formula strings.
    embedding : str, optional
        The type of embedding to compute. Must be one of ['magpie', 'mat2vec', 'megnet16', 'skipatom', 'oliynyk', 'random_200'].
        Default is 'magpie'.
    stats : str, optional
        The type of statistics to compute for the embedding. Must be one of
        ["mean", "variance", "minpool", "maxpool", "range", "sum", "geometric_mean", "harmonic_mean"].
        Default is 'mean'.

    Returns:
    --------
    numpy.ndarray
        1D array when formula is a string, 2D array (one row per formula) when
        formula is an iterable of strings. A formula that cannot be embedded
        yields a row filled with NaN. An empty iterable yields an array of
        shape (0, embedding_dim).

    Raises:
    -------
    TypeError
        If formula is neither a string nor an iterable.
    """
    # Remember whether the caller passed a single formula, so the output rank
    # depends on the input type only and never on the number of formulas.
    is_single = isinstance(formula, str)
    if is_single:
        formula = [formula]
    elif not isinstance(formula, Iterable):
        raise TypeError("formula must be a string or a list of strings")

    # get embedding dimension (used to size the NaN placeholder rows)
    embedding_dim = CompositionalEmbedding("", embedding=embedding).embedding.dim

    # compute embedding
    embeddings = []
    for f in tqdm(formula):
        try:
            compositional_embedding = CompositionalEmbedding(f, embedding=embedding)
            embeddings.append(compositional_embedding.feature_vector(stats=stats))
        except Exception:
            # the exception is raised when the embedding doesn't support the element;
            # keep a NaN row so the output stays aligned with the input order
            embeddings.append(np.full(embedding_dim, np.nan))

    # np.stack refuses an empty list; return a well-formed empty 2D array instead
    if not embeddings:
        return np.empty((0, embedding_dim))

    # Stack into (n_formulas, n_features). A blanket squeeze() is avoided on
    # purpose: it would turn a one-element list into a 1D array and break
    # callers that pair the rows with an index of length one.
    embeddings = np.stack(embeddings, axis=0)
    return embeddings[0] if is_single else embeddings
# make the directory to save the embeddings
embeddings_dir = save_dir / "embeddings"
embeddings_dir.mkdir(parents=True, exist_ok=True)

# For every embedding scheme: featurise the sampled formulas (the index of
# df_sample), drop the rows that contain NaN, and pickle the resulting table.
for name in embedding_names:
    print(f"Computing {name} embeddings")
    embeddings = get_embedding(df_sample.index, embedding=name)
    df_embeddings = pd.DataFrame(embeddings, index=df_sample.index).dropna(axis="index")
    df_embeddings.to_pickle(embeddings_dir / f"embeddings_{name}.pkl")
    print(
        f"Saved {name} embeddings with shape {df_embeddings.shape} to {save_dir / 'embeddings' / f'embeddings_{name}.pkl'}"
    )
2. Dimension reduction#
Next, we will use dimension reduction techniques to reduce the dimensionality of the data.
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
def dimension_reduction(
    embeddings,
    reducer="pca",
    n_components=2,
    save_dir=None,
    file_name=None,
    **kwargs,
):
    """
    Performs dimensionality reduction on the given embeddings.

    Parameters:
    -----------
    embeddings : pandas.DataFrame
        The embeddings to reduce.
    reducer : str, optional
        The dimensionality reduction algorithm to use. Must be one of ['pca', 'tsne', 'umap'].
        Default is 'pca'.
    n_components : int, optional
        The number of components to reduce to. Default is 2.
    save_dir : str, optional
        The directory to save the reduced embeddings. It is created (including
        missing parent directories) when it does not exist. Default is None,
        in which case nothing is written to disk.
    file_name : str, optional
        The file name (without extension) to save the reduced embeddings;
        ".pkl" is appended. Default is None, which saves to
        "reduced_embeddings_<reducer class name>.pkl".
    **kwargs : dict, optional
        Extra keyword arguments forwarded unchanged to the constructor of the
        selected reducer (PCA, TSNE or UMAP).

    Returns:
    --------
    numpy.ndarray
        The reduced embeddings.

    Raises:
    -------
    ValueError
        If reducer is not one of 'pca', 'tsne' or 'umap'.
    """
    # Build the estimator under its own name instead of rebinding the
    # `reducer` argument, so the string and the object are never confused.
    if reducer == "pca":
        model = PCA(n_components=n_components, **kwargs)
    elif reducer == "tsne":
        model = TSNE(n_components=n_components, **kwargs)
    elif reducer == "umap":
        model = UMAP(n_components=n_components, **kwargs)
    else:
        raise ValueError("reducer must be one of ['pca', 'tsne', 'umap']")

    reduced_embeddings = model.fit_transform(embeddings.values)

    if save_dir is not None:
        save_dir = Path(save_dir)
        # parents=True: a nested output directory must not fail just because
        # an intermediate directory has not been created yet.
        save_dir.mkdir(parents=True, exist_ok=True)
        if file_name is None:
            file_name = f"reduced_embeddings_{type(model).__name__}.pkl"
        else:
            file_name = f"{file_name}.pkl"
        output_path = save_dir / file_name
        # Keep the original index so each reduced row can be traced back to its input row.
        pd.DataFrame(reduced_embeddings, index=embeddings.index).to_pickle(output_path)
        print(f"Saved reduced embeddings to {output_path}")
    return reduced_embeddings
# make the directory to save the reduced embeddings
(save_dir / "reduced_embeddings_2d").mkdir(parents=True, exist_ok=True)

# calculate the reduced embeddings
# NOTE(review): silhouette_scores is never filled in the visible code; it is
# kept in case a later cell relies on the name.
silhouette_scores = {}
for name in embedding_names:
    # The stored embeddings depend only on `name`, so load them once per
    # embedding scheme rather than once per (scheme, reducer) pair.
    embeddings = pd.read_pickle(save_dir / "embeddings" / f"embeddings_{name}.pkl")
    for reducer in reducers:
        print(f"Computing {name} {reducer} embeddings")
        reduced_embeddings = dimension_reduction(
            embeddings,
            reducer=reducer,
            n_components=2,
            save_dir=save_dir / "reduced_embeddings_2d",
            file_name=f"{reducer}_{name}",
            random_state=42,  # forwarded to the reducer for reproducible output
        )
3. Visualisation of the low dimensional embeddings#
from smact.utils.crystal_space.plot_embedding import plot_reducers_embeddings
# Reload the category table and copy its index into a "formula" column.
df_category = pd.read_pickle(save_dir / "df_binary_category.pkl")
df_category = df_category.assign(formula=df_category.index)

# Directory holding the pickled 2D embeddings, and the output image path.
embedding_dir = save_dir / "reduced_embeddings_2d"
save_path = save_dir / "plot_binary.jpg"  # save path for the plot

# Keyword options forwarded to the plotting helper.
plot_options = {
    "symbol": "circle",
    "title": "Compositional space for binary compounds",
    "save_path": save_path,
}
fig = plot_reducers_embeddings(
    df_category,
    reducers,
    embedding_names,
    embedding_dir,
    **plot_options,
)

# A bare expression on the last line of a notebook cell displays the figure.
fig