Source code for lobsterpy.featurize.batch

# Copyright (c) lobsterpy development team
# Distributed under the terms of a BSD 3-Clause "New" or "Revised" License

"""This module defines wrapper classes to quickly obtain similarity matrix of input fingerprint objects."""

from __future__ import annotations

import multiprocessing as mp
import os
import warnings
from pathlib import Path
from typing import Literal

import numpy as np
import pandas as pd
from tqdm.autonotebook import tqdm

from lobsterpy.featurize.core import (
    CoxxFingerprint,
    FeaturizeCharges,
    FeaturizeCOXX,
    FeaturizeDoscar,
    FeaturizeIcoxxlist,
    FeaturizeLobsterpy,
)
from lobsterpy.featurize.utils import get_file_paths
from lobsterpy.structuregraph.graph import LobsterGraph

warnings.filterwarnings("ignore")


[docs] class BatchSummaryFeaturizer: """ Batch Featurizer sets that generates summary features from lobster data. :param path_to_lobster_calcs: path to root directory consisting of all lobster calc :param path_to_jsons: path to root directory consisting of all lobster lightweight jsons :param feature_type: set the feature type for moment features. Possible options are `bonding`, `antibonding` or `overall` :param charge_type: set charge type used for computing ionicity. Possible options are `mulliken`, `loewdin` or `both`. :param bonds: `all_bonds` or `cation_anion_bonds` :param orbital_resolved: bool indicating whether LobsterPy analysis is performed orbital wise :param include_cobi_data: bool stating to include COBICAR.lobster features :param include_coop_data: bool stating to include COOPCAR.lobster features :param e_range: range of energy relative to fermi for which moment features needs to be computed :param n_jobs: parallel processes to run """ def __init__( self, path_to_lobster_calcs: str | Path, path_to_jsons: str | Path | None = None, feature_type: Literal["bonding", "antibonding", "overall"] = "antibonding", charge_type: Literal["mulliken", "loewdin", "both"] = "both", bonds: Literal["all", "cation-anion"] = "all", orbital_resolved: bool = False, include_cobi_data: bool = False, include_coop_data: bool = False, e_range: list[float] = [-5.0, 0.0], n_jobs: int = 4, **analysis_kwargs, ): """ Featurize lobster data via multiprocessing for large number of compounds. :param path_to_lobster_calcs: path to root directory consisting of all lobster calc :param path_to_jsons: path to root directory consisting of all lobster lightweight jsons :param feature_type: set the feature type for moment features. Possible options are `bonding`, `antibonding` or `overall` :param charge_type: set charge type used for computing ionicity. Possible options are `mulliken`, `loewdin` or `both`. :param bonds: `all` or `cation-anion` bonds :param orbital_resolved: bool indicating whether LobsterPy analysis is performed orbital wise :param include_cobi_data: bool stating to include COBICAR.lobster features :param include_coop_data: bool stating to include COOPCAR.lobster features :param e_range: range of energy relative to fermi for which moment features needs to be computed :param n_jobs: parallel processes to run :param analysis_kwargs: keyword arguments for Analysis class of Lobsterpy """ # Check for valid parameters of string type allowed_str_inputs = { "charge_type": ["mulliken", "loewdin", "both"], "bonds": ["all", "cation-anion"], "feature_type": ["bonding", "antibonding", "overall"], } for param, param_string in zip([charge_type, bonds, feature_type], ["charge_type", "bonds", "feature_type"]): if param not in allowed_str_inputs[param_string]: raise ValueError( f"Parameter {param_string} set to {param} but must be in " f"{list(allowed_str_inputs[param_string])}." ) self.path_to_lobster_calcs = path_to_lobster_calcs self.path_to_jsons = path_to_jsons self.feature_type = feature_type self.charge_type = charge_type self.bonds = bonds self.orbital_resolved = orbital_resolved self.include_cobi_data = include_cobi_data self.include_coop_data = include_coop_data self.e_range = e_range self.n_jobs = n_jobs self.analysis_kwargs = analysis_kwargs def _featurizelobsterpy(self, file_name_or_path: str | Path) -> pd.DataFrame: """ Featurize Lobsterpy condensed bonding analysis data. if lightweight json file exists loads that or invokes LobsterPy Analysis class. :param file_name_or_path: path to the LOBSTER calc directory or lightweight condensed bonding analysis json file name. Returns: A pandas dataframe with ICOHP stats like mean, min, max of relevant bonds and madelung energies """ if Path(file_name_or_path).is_file(): featurize_lobsterpy = FeaturizeLobsterpy( path_to_json=file_name_or_path, bonds=self.bonds, ) else: featurize_lobsterpy = FeaturizeLobsterpy( path_to_lobster_calc=file_name_or_path, bonds=self.bonds, orbital_resolved=self.orbital_resolved, **self.analysis_kwargs, ) return featurize_lobsterpy.get_df() def _featurizecoxx(self, path_to_lobster_calc: str | Path) -> pd.DataFrame: """ Featurize COHP/COBI/COOPCAR data using FeaturizeCOXX. :param path_to_lobster_calc: path to root LOBSTER calc directory Returns: A pandas dataframe with COHP summary stats data mainly weighted ICOHP/ICOOP/ICOBI, Effective interaction number and moment features (center, width, skewness and kurtosis) """ file_paths = get_file_paths( path_to_lobster_calc=path_to_lobster_calc, requested_files=["structure", "cohpcar", "icohplist"] ) structure_path = file_paths.get("structure") coxx = FeaturizeCOXX( path_to_coxxcar=str(file_paths.get("cohpcar")), path_to_icoxxlist=str(file_paths.get("icohplist")), path_to_structure=str(structure_path), feature_type=self.feature_type, e_range=self.e_range, ) df = coxx.get_summarized_coxx_df() del coxx if self.include_cobi_data: file_paths = get_file_paths( path_to_lobster_calc=path_to_lobster_calc, requested_files=["cobicar", "icobilist"] ) coxx = FeaturizeCOXX( path_to_coxxcar=str(file_paths.get("cobicar")), path_to_icoxxlist=str(file_paths.get("icobilist")), path_to_structure=str(structure_path), feature_type=self.feature_type, e_range=self.e_range, are_cobis=True, ) df_cobi = coxx.get_summarized_coxx_df() df = pd.concat([df, df_cobi], axis=1) del coxx if self.include_coop_data: file_paths = get_file_paths( path_to_lobster_calc=path_to_lobster_calc, requested_files=["coopcar", "icooplist"] ) coxx = FeaturizeCOXX( path_to_coxxcar=str(file_paths.get("coopcar")), path_to_icoxxlist=str(file_paths.get("icooplist")), path_to_structure=str(structure_path), feature_type=self.feature_type, e_range=self.e_range, are_coops=True, ) df_coop = coxx.get_summarized_coxx_df() df = pd.concat([df, df_coop], axis=1) del coxx return df def _featurizecharges(self, path_to_lobster_calc: str | Path) -> pd.DataFrame: """ Featurize CHARGE.lobster.gz data that using FeaturizeCharges. :param path_to_lobster_calc: path to root LOBSTER calc directory Returns: A pandas dataframe with computed ionicity for the structure """ file_paths = get_file_paths(path_to_lobster_calc=path_to_lobster_calc, requested_files=["structure", "charge"]) if self.charge_type == "mulliken": charge_mull = FeaturizeCharges( path_to_charge=str(file_paths.get("charge")), path_to_structure=str(file_paths.get("structure")), charge_type="mulliken", ) df = charge_mull.get_df() elif self.charge_type == "loewdin": charge_loew = FeaturizeCharges( path_to_charge=str(file_paths.get("charge")), path_to_structure=str(file_paths.get("structure")), charge_type="loewdin", ) df = charge_loew.get_df() else: charge_mull = FeaturizeCharges( path_to_charge=str(file_paths.get("charge")), path_to_structure=str(file_paths.get("structure")), charge_type="mulliken", ) df_mull = charge_mull.get_df() charge_loew = FeaturizeCharges( path_to_charge=str(file_paths.get("charge")), path_to_structure=str(file_paths.get("structure")), charge_type="loewdin", ) df_loew = charge_loew.get_df() df = pd.concat([df_mull, df_loew], axis=1) return df
[docs] def get_df(self) -> pd.DataFrame: """ Generate a pandas dataframe with summary features extracted from LOBSTER files. Uses multiprocessing to speed up the process. Returns: Returns a pandas dataframe """ if self.path_to_jsons: file_name_or_path = [ os.path.join(self.path_to_jsons, f) for f in os.listdir(self.path_to_jsons) if not f.startswith("t") and not f.startswith(".") and not os.path.isdir(f) ] elif self.path_to_lobster_calcs and not self.path_to_jsons: file_name_or_path = [ os.path.join(self.path_to_lobster_calcs, f) for f in os.listdir(self.path_to_lobster_calcs) if not f.startswith("t") and not f.startswith(".") and os.path.isdir(os.path.join(self.path_to_lobster_calcs, f)) ] row = [] with ( mp.Pool(processes=self.n_jobs, maxtasksperchild=1) as pool, tqdm(total=len(file_name_or_path), desc="Generating LobsterPy summary stats") as pbar, ): for _, result in enumerate(pool.imap_unordered(self._featurizelobsterpy, file_name_or_path, chunksize=1)): pbar.update() row.append(result) df_lobsterpy = pd.concat(row) df_lobsterpy.sort_index(inplace=True) # noqa: PD002 paths = [ os.path.join(self.path_to_lobster_calcs, f) for f in os.listdir(self.path_to_lobster_calcs) if not f.startswith("t") and not f.startswith(".") and os.path.isdir(os.path.join(self.path_to_lobster_calcs, f)) ] row = [] with ( mp.Pool(processes=self.n_jobs, maxtasksperchild=1) as pool, tqdm(total=len(paths), desc="Generating COHP/COOP/COBI summary stats") as pbar, ): for _, result in enumerate(pool.imap_unordered(self._featurizecoxx, paths, chunksize=1)): pbar.update() row.append(result) df_coxx = pd.concat(row) df_coxx.sort_index(inplace=True) # noqa: PD002 row = [] with ( mp.Pool(processes=self.n_jobs, maxtasksperchild=1) as pool, tqdm(total=len(paths), desc="Generating charge based features") as pbar, ): for _, result in enumerate(pool.imap_unordered(self._featurizecharges, paths, chunksize=1)): pbar.update() row.append(result) df_charges = pd.concat(row) df_charges.sort_index(inplace=True) # noqa: PD002 return pd.concat([df_lobsterpy, df_coxx, df_charges], axis=1)
[docs] class BatchCoxxFingerprint: """ BatchFeaturizer to generate COHP/COOP/COBI fingerprints and Tanimoto index similarity matrix. :param path_to_lobster_calcs: path to root directory consisting of all lobster calc :param feature_type: set the feature type for moment features. Possible options are `bonding`, `antibonding` or `overall` :param label_list: bond labels list for which fingerprints needs to be generated. :param tanimoto: bool to state to compute tanimoto index between fingerprint objects :param normalize: bool to state to normalize the fingerprint data :param spin_type: can be `summed` or `up` or `down`. :param n_bins: sets number for bins for fingerprint objects :param e_range: range of energy relative to fermi for which moment features needs to be computed :param n_jobs: number of parallel processes to run :param fingerprint_for: Possible options are `cohp` or `cobi` or `coop`. Based on this fingerprints will be computed for COHPCAR/COOBICAR/COOPCAR.lobster files Attributes: fingerprint_df: A pandas dataframe with fingerprint objects """ def __init__( self, path_to_lobster_calcs: str | Path, feature_type: Literal["bonding", "antibonding", "overall"] = "overall", label_list: list[str] | None = None, tanimoto: bool = True, normalize: bool = True, spin_type: Literal["summed", "up", "down"] = "summed", n_bins: int = 56, e_range: list[float] = [-15.0, 0.0], n_jobs=4, fingerprint_for: Literal["cohp", "cobi", "coop"] = "cohp", ): """ Generate COHP/COOP/COBI fingerprints and pair-wise Tanimoto index similarity matrix. :param path_to_lobster_calcs: path to root directory consisting of all lobster calc :param feature_type: set the feature type for moment features. Possible options are `bonding`, `antibonding` or `overall` :param label_list: bond labels list for which fingerprints needs to be generated. :param tanimoto: bool to state to compute tanimoto index between fingerprint objects :param normalize: bool to state to normalize the fingerprint data :param spin_type: can be `summed` or `up` or `down`. :param n_bins: sets number for bins for fingerprint objects :param e_range: range of energy relative to fermi for which moment features needs to be computed :param n_jobs: number of parallel processes to run :param fingerprint_for: Possible options are `cohp` or `cobi` or `coop`. Based on this fingerprints will be computed for COHPCAR/COOBICAR/COOPCAR.lobster files """ self.path_to_lobster_calcs = path_to_lobster_calcs self.feature_type = feature_type self.tanimoto = tanimoto self.normalize = normalize self.label_list = label_list self.spin_type = spin_type self.n_bins = n_bins self.e_range = e_range self.n_jobs = n_jobs self.fingerprint_for = fingerprint_for self.fingerprint_df = self._get_fingerprints_df()
[docs] def get_similarity_matrix_df(self) -> pd.DataFrame: """ Compute pairwise similarity index for each fingerprint object in input dataframe. Returns: A Pandas dataframe """ matrix = np.full((self.fingerprint_df.shape[0], self.fingerprint_df.shape[0]), np.nan) for i, (_, col) in enumerate(self.fingerprint_df.iterrows()): for j, (_, col1) in enumerate(self.fingerprint_df.iterrows()): if self.tanimoto: simi = self._get_fp_similarity( col["COXX_FP"], col1["COXX_FP"], tanimoto=self.tanimoto, normalize=False, ) else: simi = self._get_fp_similarity( col["COXX_FP"], col1["COXX_FP"], tanimoto=False, normalize=True, ) matrix[i][j] = simi return pd.DataFrame( matrix, index=list(self.fingerprint_df.index), columns=list(self.fingerprint_df.index), )
@staticmethod def _fp_to_dict(fp: CoxxFingerprint) -> dict: """ Convert a fingerprint obj into a dictionary. :param fp: The fingerprint to be converted into a dictionary Returns: dict: A dict of the fingerprint Keys=type, Values=np.ndarray(energies, cohp) """ fp_dict = {} fp_dict[fp[2]] = np.array([fp[0], fp[1]], dtype="object").T return fp_dict @staticmethod def _get_fp_similarity( fp1: CoxxFingerprint, fp2: CoxxFingerprint, col: int = 1, pt: int | str = "All", normalize: bool = False, tanimoto: bool = True, ) -> float: """ Calculate the similarity index (dot product) of two fingerprints. :param fp1 The 1st CoxxFingerprint object :param fp2: The 2nd CoxxFingerprint object :param col: The item in the fingerprints (0:energies,1: coxxs) to take the dot product of (default is 1) :param pt: The index of the point that the dot product is to be taken (default is All) :param normalize: If True normalize the scalar product to 1 (default is False) :param tanimoto: If True will compute Tanimoto index (default is False) Raises: ValueError: If both tanimoto and normalize are set to True. Returns: Similarity index (float): The value of dot product """ fp1_dict = BatchCoxxFingerprint._fp_to_dict(fp1) if not isinstance(fp1, dict) else fp1 fp2_dict = BatchCoxxFingerprint._fp_to_dict(fp2) if not isinstance(fp2, dict) else fp2 if pt == "All": vec1 = np.array([pt[col] for pt in fp1_dict.values()]).flatten() vec2 = np.array([pt[col] for pt in fp2_dict.values()]).flatten() else: vec1 = fp1_dict[fp1[2][pt]][col] # type: ignore vec2 = fp2_dict[fp2[2][pt]][col] # type: ignore if not normalize and tanimoto: rescale = np.linalg.norm(vec1) ** 2 + np.linalg.norm(vec2) ** 2 - np.dot(vec1, vec2) elif not tanimoto and normalize: rescale = np.linalg.norm(vec1) * np.linalg.norm(vec2) elif not tanimoto and not normalize: rescale = 1.0 else: raise ValueError( "Cannot compute similarity index. Please set either normalize=True or tanimoto=True or both to False." ) return np.dot(vec1, vec2) / rescale def _fingerprint_df(self, path_to_lobster_calc: str | Path) -> pd.DataFrame: """ Get fingerprint object dataframe via FeaturizeCOXX.get_coxx_fingerprint_df. Also helps to generate the data used for fingerprint generation. :param path_to_lobster_calc: path to root LOBSTER calculation directory. Returns: A pandas dataframe with COXX fingerprint object """ if self.fingerprint_for.upper() == "COBI": file_paths = get_file_paths( path_to_lobster_calc=path_to_lobster_calc, requested_files=["structure", "cobicar", "icobilist"] ) coxxcar_path = file_paths.get("cobicar") icoxxlist_path = file_paths.get("icobilist") are_cobis = True are_coops = False elif self.fingerprint_for.upper() == "COOP": file_paths = get_file_paths( path_to_lobster_calc=path_to_lobster_calc, requested_files=["structure", "coopcar", "icooplist"] ) coxxcar_path = file_paths.get("coopcar") icoxxlist_path = file_paths.get("icooplist") are_cobis = False are_coops = True else: file_paths = get_file_paths( path_to_lobster_calc=path_to_lobster_calc, requested_files=["structure", "cohpcar", "icohplist"] ) coxxcar_path = file_paths.get("cohpcar") icoxxlist_path = file_paths.get("icohplist") are_cobis = False are_coops = False coxx = FeaturizeCOXX( path_to_coxxcar=str(coxxcar_path), path_to_icoxxlist=str(icoxxlist_path), path_to_structure=str(file_paths.get("structure")), feature_type=self.feature_type, e_range=self.e_range, are_coops=are_coops, are_cobis=are_cobis, ) return coxx.get_coxx_fingerprint_df( spin_type=self.spin_type, n_bins=self.n_bins, normalize=self.normalize, label_list=self.label_list, ) def _get_fingerprints_df(self) -> pd.DataFrame: """ Generate fingerprint objects dataframe using BatchCoxxFingerprint._fingerprint_df. Returns: A pandas dataframe with COXX fingerprint objects """ paths = [ os.path.join(self.path_to_lobster_calcs, f) for f in os.listdir(self.path_to_lobster_calcs) if not f.startswith("t") and not f.startswith(".") and os.path.isdir(os.path.join(self.path_to_lobster_calcs, f)) ] row = [] with ( mp.Pool(processes=self.n_jobs, maxtasksperchild=1) as pool, tqdm( total=len(paths), desc=f"Generating {self.fingerprint_for.upper()} fingerprints", ) as pbar, ): for _, result in enumerate(pool.imap_unordered(self._fingerprint_df, paths, chunksize=1)): pbar.update() row.append(result) df = pd.concat(row) df.sort_index(inplace=True) # noqa: PD002 return df
[docs] class BatchStructureGraphs: """ Batch Featurizer that generates structure graphs with lobster data. :param path_to_lobster_calcs: path to root directory consisting of all lobster calc :param add_additional_data_sg: bool indicating whether to include `icoop` and `icobi` data as edge properties :param which_bonds: selects which kind of bonds are analyzed. "all" is the default :param cutoff_icohp: only bonds that are stronger than cutoff_icohp * strongest ICOHP will be considered. :param noise_cutoff: if provided hardcodes the lower limit of icohps considered. :param start: start energy for bonding antibonding percent integration :param n_jobs: parallel processes to run """ def __init__( self, path_to_lobster_calcs: str | Path, add_additional_data_sg: bool = True, which_bonds: Literal["cation-anion", "all"] = "all", cutoff_icohp: float = 0.10, noise_cutoff: float = 0.1, start: float | None = None, n_jobs: int = 4, ): """ Generate structure graphs with LOBSTER data via multiprocessing. :param path_to_lobster_calcs: path to root directory consisting of all lobster calc :param add_additional_data_sg: bool indicating whether to include `icoop` and `icobi` data as edge properties :param which_bonds: selects which kind of bonds are analyzed. "all" is the default :param cutoff_icohp: only bonds that are stronger than cutoff_icohp * strongest ICOHP will be considered. :param noise_cutoff: if provided hardcodes the lower limit of icohps considered. :param start: start energy for bonding antibonding percent integration :param n_jobs: parallel processes to run """ self.path_to_lobster_calcs = path_to_lobster_calcs self.add_additional_data_sg = add_additional_data_sg self.which_bonds = which_bonds self.cutff_icohp = cutoff_icohp self.noise_cutoff = noise_cutoff self.start = start self.n_jobs = n_jobs def _get_sg_df(self, path_to_lobster_calc: str | Path) -> pd.DataFrame: """ Generate a structure graph with LOBSTER data bonding analysis data. :param path_to_lobster_calc: path to root LOBSTER calculation directory Returns: A structure graph with LOBSTER data as edge and node properties in structure graph objects """ dir_name = Path(path_to_lobster_calc) file_paths = get_file_paths( path_to_lobster_calc=path_to_lobster_calc, requested_files=["charge", "cohpcar", "icohplist", "icooplist", "icobilist", "madelung", "structure"], ) graph = LobsterGraph( path_to_poscar=str(file_paths.get("structure")), path_to_charge=str(file_paths.get("charge")), path_to_cohpcar=str(file_paths.get("cohpcar")), path_to_icohplist=str(file_paths.get("icohplist")), add_additional_data_sg=self.add_additional_data_sg, path_to_icooplist=str(file_paths.get("icooplist")), path_to_icobilist=str(file_paths.get("icobilist")), path_to_madelung=str(file_paths.get("madelung")), which_bonds=self.which_bonds, cutoff_icohp=self.cutff_icohp, noise_cutoff=self.noise_cutoff, start=self.start, ) ids = dir_name.name df = pd.DataFrame(index=[ids]) df.loc[ids, "structure_graph"] = graph.sg return df
[docs] def get_df(self) -> pd.DataFrame: """ Generate a pandas dataframe with structure graph with LOBSTER data. Uses multiprocessing to speed up the process. Returns: Returns a pandas dataframe """ paths = [ os.path.join(self.path_to_lobster_calcs, f) for f in os.listdir(self.path_to_lobster_calcs) if not f.startswith("t") and not f.startswith(".") and os.path.isdir(os.path.join(self.path_to_lobster_calcs, f)) ] row = [] with ( mp.Pool(processes=self.n_jobs, maxtasksperchild=1) as pool, tqdm(total=len(paths), desc="Generating Structure Graphs") as pbar, ): for _, result in enumerate(pool.imap_unordered(self._get_sg_df, paths, chunksize=1)): pbar.update() row.append(result) df_sg = pd.concat(row) df_sg.sort_index(inplace=True) # noqa: PD002 return df_sg
[docs] class BatchDosFeaturizer: """ BatchFeaturizer to generate Lobster DOS moment features and fingerprints. :param path_to_lobster_calcs: path to root directory consisting of all lobster calc :param add_element_dos_moments: add element dos moment features alongside orbital dos :param normalize: bool to state to normalize the fingerprint data :param n_bins: sets number for bins for fingerprint objects :param e_range: range of energy relative to fermi for which moment features needs to be computed :param n_jobs: number of parallel processes to run :param fingerprint_type: Specify fingerprint type to compute, can accept `{s/p/d/f/}summed_{pdos/tdos}` (default is summed_pdos) :param use_lso_dos: Will force featurizer to use DOSCAR.LSO.lobster instead of DOSCAR.lobster """ def __init__( self, path_to_lobster_calcs: str | Path, add_element_dos_moments: bool = False, fingerprint_type: Literal["s", "p", "d", "f", "summed_pdos", "tdos"] = "summed_pdos", normalize: bool = True, n_bins: int = 56, e_range: list[float] = [-15.0, 0.0], n_jobs=4, use_lso_dos: bool = True, ): """ Initialize BatchDosFeaturizer. :param path_to_lobster_calcs: path to root directory consisting of all lobster calc :param add_element_dos_moments: add element dos moment features alongside orbital dos :param normalize: bool to state to normalize the fingerprint data :param n_bins: sets number for bins for fingerprint objects :param e_range: range of energy relative to fermi for which moment features needs to be computed :param n_jobs: number of parallel processes to run :param fingerprint_type: Specify fingerprint type to compute, can accept `{s/p/d/f/tdos/summed_{pdos}` (default is summed_pdos) :param use_lso_dos: Will force featurizer to use DOSCAR.LSO.lobster instead of DOSCAR.lobster """ self.path_to_lobster_calcs = path_to_lobster_calcs self.add_element_dos_moments = add_element_dos_moments self.fingerprint_type = fingerprint_type self.e_range = e_range self.normalize = normalize self.n_jobs = n_jobs self.n_bins = n_bins self.use_lso_dos = use_lso_dos def _get_dos_moments_df(self, path_to_lobster_calc: str | Path) -> pd.DataFrame: """ Featurize DOSCAR.lobster data using FeaturizeDOSCAR. Returns: A pandas dataframe with computed PDOS moment features """ file_paths = get_file_paths( path_to_lobster_calc=path_to_lobster_calc, requested_files=["structure", "doscar"], use_lso_dos=self.use_lso_dos, ) featurize_dos = FeaturizeDoscar( path_to_doscar=str(file_paths.get("doscar")), path_to_structure=str(file_paths.get("structure")), add_element_dos_moments=self.add_element_dos_moments, e_range=self.e_range, ) return featurize_dos.get_df() def _get_dos_fingerprints_df(self, path_to_lobster_calc: str | Path) -> pd.DataFrame: """ Featurize DOSCAR.lobster data into fingerprints using FeaturizeDOSCAR. :param path_to_lobster_calc: path to root LOBSTER calculation directory. Returns: A pandas dataframe with DOS fingerprint objects """ file_paths = get_file_paths( path_to_lobster_calc=path_to_lobster_calc, requested_files=["structure", "doscar"], use_lso_dos=self.use_lso_dos, ) featurize_dos = FeaturizeDoscar( path_to_doscar=str(file_paths.get("doscar")), path_to_structure=str(file_paths.get("structure")), e_range=self.e_range, ) return featurize_dos.get_fingerprint_df( fp_type=self.fingerprint_type, normalize=self.normalize, n_bins=self.n_bins, )
[docs] def get_df(self) -> pd.DataFrame: """ Generate a pandas dataframe with all moment features. Moment features are PDOS (optional: element dos) center, width, skewness, kurtosis and upper band edge. Returns: A pandas dataframe with moment features """ paths = [ os.path.join(self.path_to_lobster_calcs, f) for f in os.listdir(self.path_to_lobster_calcs) if not f.startswith("t") and not f.startswith(".") and os.path.isdir(os.path.join(self.path_to_lobster_calcs, f)) ] row = [] with ( mp.Pool(processes=self.n_jobs, maxtasksperchild=1) as pool, tqdm(total=len(paths), desc="Generating PDOS moment features") as pbar, ): for _, result in enumerate(pool.imap_unordered(self._get_dos_moments_df, paths, chunksize=1)): pbar.update() row.append(result) df_dos = pd.concat(row) df_dos.sort_index(inplace=True) # noqa: PD002 return df_dos
[docs] def get_fingerprints_df(self) -> pd.DataFrame: """ Generate a pandas dataframe with DOS fingerprints. Returns: A pandas dataframe with fingerprint objects """ paths = [ os.path.join(self.path_to_lobster_calcs, f) for f in os.listdir(self.path_to_lobster_calcs) if not f.startswith("t") and not f.startswith(".") and os.path.isdir(os.path.join(self.path_to_lobster_calcs, f)) ] row = [] with ( mp.Pool(processes=self.n_jobs, maxtasksperchild=1) as pool, tqdm(total=len(paths), desc="Generating DOS fingerprints") as pbar, ): for _, result in enumerate(pool.imap_unordered(self._get_dos_fingerprints_df, paths, chunksize=1)): pbar.update() row.append(result) df_dos_fp = pd.concat(row) df_dos_fp.sort_index(inplace=True) # noqa: PD002 return df_dos_fp
[docs] class BatchIcoxxlistFeaturizer: """ BatchFeaturizer to generate BWDF-derived features from ICOXXLIST.lobster data. :param path_to_lobster_calcs: path to root directory consisting of all lobster calc :param max_length: maximum bond length for BWDF computation :param min_length: minimum bond length for BWDF computation :param normalization: normalization strategy for BWDF :param bin_width: bin width for BWDF :param bwdf_df_type: Type of BWDF dataframe to generate - "binned": Binned BWDF function. - "stats": Statistical features of BWDF function. - "sorted_bwdf": BWDF values sorted by distances, ascending. - "sorted_dists": Distances sorted by BWDF values (either only positive or negative), sorted descending by absolute values. :param sorted_dists_mode: only applies if bwdf_df_type=="sorted_dists". Corresponds to param "mode" of get_sorted_dist_df, defines whether BWDF values above or below zero are considered for distance featurization. :read_icobis: bool to state to read ICOBILIST.lobster from the path :read_icoops: bool to state to read ICOOPLIST.lobster from the path :param n_jobs: number of parallel processes to run """ def __init__( self, path_to_lobster_calcs: str | Path, normalization: Literal["formula_units", "area", "counts", "none"] = "formula_units", bin_width: float = 0.02, bwdf_df_type: Literal["binned", "stats", "sorted_bwdf", "sorted_dists"] = "stats", sorted_dists_mode: Literal["positive", "negative"] = "negative", interactions_tol: float = 1e-3, max_length: float = 6.0, min_length: float = 0.0, read_icobis: bool = False, read_icoops: bool = False, n_jobs=4, ): """ Initialize BatchIcoxxlistFeaturizer. :param path_to_lobster_calcs: path to root directory consisting of all lobster calc :param max_length: maximum bond length for BWDF computation :param min_length: minimum bond length for BWDF computation :param normalization: normalization strategy for BWDF :param bin_width: bin width for BWDF :param bwdf_df_type: Type of BWDF dataframe to generate - "binned": Binned BWDF function. - "stats": Statistical features of BWDF function. - "sorted_bwdf": BWDF values sorted by distances, ascending. - "sorted_dists": Distances sorted by BWDF values (either only positive or negative), sorted descending by absolute values. :param sorted_dists_mode: only applies if bwdf_df_type=="sorted_dists". Corresponds to param "mode" of get_sorted_dist_df, defines whether BWDF values above or below zero are considered for distance featurization. :param interactions_tol: tolerance for interactions :param read_icobis: bool to state to read ICOBILIST.lobster from the path :param read_icoops: bool to state to read ICOOPLIST.lobster from the path :param n_jobs: number of parallel processes to run """ self.path_to_lobster_calcs = path_to_lobster_calcs self.normalization = normalization self.max_length = max_length self.min_length = min_length self.bin_width = bin_width self.interactions_tol = interactions_tol self.bwdf_df_type = bwdf_df_type self.sorted_dists_mode = sorted_dists_mode self.read_icobis = read_icobis self.read_icoops = read_icoops self.n_jobs = n_jobs def _get_icoxxlist_bwdf_df(self, path_to_lobster_calc: str | Path) -> pd.DataFrame: """ Featurize ICOXXLIST data using FeaturizeCOXX. :param path_to_lobster_calc: path to root LOBSTER calculation directory Returns: A pandas dataframe with computed ICOXXLIST moment features """ if self.read_icobis: file_paths = get_file_paths( path_to_lobster_calc=path_to_lobster_calc, requested_files=["structure", "icobilist"], ) feat_icoxx = FeaturizeIcoxxlist( path_to_icoxxlist=file_paths.get("icobilist"), path_to_structure=file_paths.get("structure"), bin_width=self.bin_width, interactions_tol=self.interactions_tol, normalization=self.normalization, max_length=self.max_length, min_length=self.min_length, are_cobis=self.read_icobis, are_coops=self.read_icoops, ) elif self.read_icoops: file_paths = get_file_paths( path_to_lobster_calc=path_to_lobster_calc, requested_files=["structure", "icooplist"], ) feat_icoxx = FeaturizeIcoxxlist( path_to_icoxxlist=file_paths.get("icooplist"), path_to_structure=file_paths.get("structure"), bin_width=self.bin_width, interactions_tol=self.interactions_tol, normalization=self.normalization, max_length=self.max_length, min_length=self.min_length, are_cobis=self.read_icobis, are_coops=self.read_icoops, ) else: file_paths = get_file_paths( path_to_lobster_calc=path_to_lobster_calc, requested_files=["structure", "icohplist"], ) feat_icoxx = FeaturizeIcoxxlist( path_to_icoxxlist=file_paths.get("icohplist"), path_to_structure=file_paths.get("structure"), bin_width=self.bin_width, interactions_tol=self.interactions_tol, normalization=self.normalization, max_length=self.max_length, min_length=self.min_length, are_cobis=self.read_icobis, are_coops=self.read_icoops, ) if self.bwdf_df_type == "binned": return feat_icoxx.get_binned_bwdf_df() if self.bwdf_df_type == "sorted_bwdf": return feat_icoxx.get_sorted_bwdf_df() if self.bwdf_df_type == "sorted_dists": return feat_icoxx.get_sorted_dist_df(mode=self.sorted_dists_mode) return feat_icoxx.get_stats_df()
[docs] def get_df(self) -> pd.DataFrame: """ Generate a pandas dataframe with BWDF for all calcs. Returns: A pandas dataframe with BWDF features as columns. The features can be either binned, sorted or statistical. Depends on the "bwdf_df_type" parameter set when the class is initialized. """ paths = [ os.path.join(self.path_to_lobster_calcs, f) for f in os.listdir(self.path_to_lobster_calcs) if not f.startswith("t") and not f.startswith(".") and os.path.isdir(os.path.join(self.path_to_lobster_calcs, f)) ] row = [] with ( mp.Pool(processes=self.n_jobs, maxtasksperchild=1) as pool, tqdm(total=len(paths), desc="Generating BWDF from ICOXXLIST") as pbar, ): for _, result in enumerate(pool.imap_unordered(self._get_icoxxlist_bwdf_df, paths, chunksize=1)): pbar.update() row.append(result) df_icoxxlist = pd.concat(row) if self.bwdf_df_type in ["sorted_bwdf", "sorted_dists"]: df_icoxxlist = df_icoxxlist.fillna(value=0.0) df_icoxxlist.sort_index(inplace=True) # noqa: PD002 return df_icoxxlist