Source code for lobsterpy.featurize.batch

# Copyright (c) lobsterpy development team
# Distributed under the terms of a BSD 3-Clause "New" or "Revised" License

"""This module defines wrapper classes to quickly obtain similarity matrix of input fingerprint objects."""

from __future__ import annotations

import multiprocessing as mp
import os
import warnings
from pathlib import Path
from typing import Literal

import numpy as np
import pandas as pd
from tqdm.autonotebook import tqdm

from lobsterpy.featurize.core import (
    CoxxFingerprint,
    FeaturizeCharges,
    FeaturizeCOXX,
    FeaturizeDoscar,
    FeaturizeIcoxxlist,
    FeaturizeLobsterpy,
)
from lobsterpy.featurize.utils import get_file_paths
from lobsterpy.structuregraph.graph import LobsterGraph

warnings.filterwarnings("ignore")



[docs]
class BatchSummaryFeaturizer:
    """
    Batch Featurizer sets that generates summary features from lobster data.

    :param path_to_lobster_calcs: path to root directory consisting of all lobster calc
    :param path_to_jsons: path to root directory consisting of all lobster lightweight jsons
    :param feature_type: set the feature type for moment features.
        Possible options are `bonding`, `antibonding` or `overall`
    :param charge_type: set charge type used for computing ionicity. Possible options are
        `mulliken`, `loewdin` or `both`.
    :param bonds: `all` or `cation-anion` bonds
    :param orbital_resolved: bool indicating whether LobsterPy analysis is performed orbital wise
    :param include_cobi_data: bool stating to include COBICAR.lobster features
    :param include_coop_data: bool stating to include COOPCAR.lobster features
    :param e_range: range of energy relative to fermi for which moment features needs to be computed
    :param n_jobs: parallel processes to run
    """

    def __init__(
        self,
        path_to_lobster_calcs: str | Path,
        path_to_jsons: str | Path | None = None,
        feature_type: Literal["bonding", "antibonding", "overall"] = "antibonding",
        charge_type: Literal["mulliken", "loewdin", "both"] = "both",
        bonds: Literal["all", "cation-anion"] = "all",
        orbital_resolved: bool = False,
        include_cobi_data: bool = False,
        include_coop_data: bool = False,
        e_range: list[float] = [-5.0, 0.0],
        n_jobs: int = 4,
        **analysis_kwargs,
    ):
        """
        Featurize lobster data via multiprocessing for large number of compounds.

        :param path_to_lobster_calcs: path to root directory consisting of all lobster calc
        :param path_to_jsons: path to root directory consisting of all lobster lightweight jsons
        :param feature_type: set the feature type for moment features.
            Possible options are `bonding`, `antibonding` or `overall`
        :param charge_type: set charge type used for computing ionicity. Possible options are
            `mulliken`, `loewdin` or `both`.
        :param bonds: `all` or `cation-anion` bonds
        :param orbital_resolved: bool indicating whether LobsterPy analysis is performed orbital wise
        :param include_cobi_data: bool stating to include COBICAR.lobster features
        :param include_coop_data: bool stating to include COOPCAR.lobster features
        :param e_range: range of energy relative to fermi for which moment features needs to be computed
        :param n_jobs: parallel processes to run
        :param analysis_kwargs: keyword arguments for Analysis class of Lobsterpy
        """
        # Check for valid parameters of string type
        allowed_str_inputs = {
            "charge_type": ["mulliken", "loewdin", "both"],
            "bonds": ["all", "cation-anion"],
            "feature_type": ["bonding", "antibonding", "overall"],
        }
        for param, param_string in zip([charge_type, bonds, feature_type], ["charge_type", "bonds", "feature_type"]):
            if param not in allowed_str_inputs[param_string]:
                raise ValueError(
                    f"Parameter {param_string} set to {param} but must be in "
                    f"{list(allowed_str_inputs[param_string])}."
                )

        self.path_to_lobster_calcs = path_to_lobster_calcs
        self.path_to_jsons = path_to_jsons
        self.feature_type = feature_type
        self.charge_type = charge_type
        self.bonds = bonds
        self.orbital_resolved = orbital_resolved
        self.include_cobi_data = include_cobi_data
        self.include_coop_data = include_coop_data
        self.e_range = e_range
        self.n_jobs = n_jobs
        self.analysis_kwargs = analysis_kwargs

    def _featurizelobsterpy(self, file_name_or_path: str | Path) -> pd.DataFrame:
        """
        Build the LobsterPy condensed bonding analysis features for one entry.

        An existing file is handed to FeaturizeLobsterpy as a lightweight json;
        anything else is treated as a LOBSTER calc directory and analysed with the
        instance's `orbital_resolved` setting and the stored analysis kwargs.

        :param file_name_or_path: path to the LOBSTER calc directory or
            lightweight condensed bonding analysis json file name.

        Returns:
            The pandas dataframe produced by FeaturizeLobsterpy.get_df()
            (ICOHP stats like mean, min, max of relevant bonds and madelung energies)

        """
        if not Path(file_name_or_path).is_file():
            # Not a file: let FeaturizeLobsterpy analyse the calc directory
            return FeaturizeLobsterpy(
                path_to_lobster_calc=file_name_or_path,
                bonds=self.bonds,
                orbital_resolved=self.orbital_resolved,
                **self.analysis_kwargs,
            ).get_df()

        return FeaturizeLobsterpy(
            path_to_json=file_name_or_path,
            bonds=self.bonds,
        ).get_df()

    def _featurizecoxx(self, path_to_lobster_calc: str | Path) -> pd.DataFrame:
        """
        Summarize COHPCAR data, optionally extended by COBICAR and COOPCAR data.

        The COHP summary is always computed. COBI and COOP summaries are appended
        column-wise when `include_cobi_data` / `include_coop_data` are set.

        :param path_to_lobster_calc: path to root LOBSTER calc directory

        Returns:
            A pandas dataframe with the summary returned by
            FeaturizeCOXX.get_summarized_coxx_df for each requested file type

        """
        cohp_files = get_file_paths(
            path_to_lobster_calc=path_to_lobster_calc, requested_files=["structure", "cohpcar", "icohplist"]
        )
        structure_path = cohp_files.get("structure")

        def summarize(coxxcar, icoxxlist, **kind_flags) -> pd.DataFrame:
            # One featurizer per file pair; it goes out of scope as soon as the summary exists
            featurizer = FeaturizeCOXX(
                path_to_coxxcar=str(coxxcar),
                path_to_icoxxlist=str(icoxxlist),
                path_to_structure=str(structure_path),
                feature_type=self.feature_type,
                e_range=self.e_range,
                **kind_flags,
            )
            return featurizer.get_summarized_coxx_df()

        summary = summarize(cohp_files.get("cohpcar"), cohp_files.get("icohplist"))

        optional_sets = (
            (self.include_cobi_data, "cobicar", "icobilist", "are_cobis"),
            (self.include_coop_data, "coopcar", "icooplist", "are_coops"),
        )
        for requested, car_key, list_key, flag_name in optional_sets:
            if not requested:
                continue
            extra_files = get_file_paths(path_to_lobster_calc=path_to_lobster_calc, requested_files=[car_key, list_key])
            extra_summary = summarize(extra_files.get(car_key), extra_files.get(list_key), **{flag_name: True})
            summary = pd.concat([summary, extra_summary], axis=1)

        return summary

    def _featurizecharges(self, path_to_lobster_calc: str | Path) -> pd.DataFrame:
        """
        Compute charge based features with FeaturizeCharges.

        `mulliken` or `loewdin` yields the dataframe for that charge type only;
        any other `charge_type` value yields both, joined column-wise
        (mulliken first).

        :param path_to_lobster_calc: path to root LOBSTER calc directory

        Returns:
            A pandas dataframe with computed ionicity for the structure

        """
        file_paths = get_file_paths(path_to_lobster_calc=path_to_lobster_calc, requested_files=["structure", "charge"])

        if self.charge_type in ("mulliken", "loewdin"):
            selected_types = (self.charge_type,)
        else:
            selected_types = ("mulliken", "loewdin")

        frames = [
            FeaturizeCharges(
                path_to_charge=str(file_paths.get("charge")),
                path_to_structure=str(file_paths.get("structure")),
                charge_type=one_type,
            ).get_df()
            for one_type in selected_types
        ]

        if len(frames) == 1:
            return frames[0]
        return pd.concat(frames, axis=1)


[docs]
    def get_df(self) -> pd.DataFrame:
        """
        Generate a pandas dataframe with summary features extracted from LOBSTER files.

        Uses multiprocessing to speed up the process.

        Returns:
            Returns a pandas dataframe

        """
        if self.path_to_jsons:
            file_name_or_path = [
                os.path.join(self.path_to_jsons, f)
                for f in os.listdir(self.path_to_jsons)
                if not f.startswith("t") and not f.startswith(".") and not os.path.isdir(f)
            ]

        elif self.path_to_lobster_calcs and not self.path_to_jsons:
            file_name_or_path = [
                os.path.join(self.path_to_lobster_calcs, f)
                for f in os.listdir(self.path_to_lobster_calcs)
                if not f.startswith("t")
                and not f.startswith(".")
                and os.path.isdir(os.path.join(self.path_to_lobster_calcs, f))
            ]

        row = []
        with (
            mp.Pool(processes=self.n_jobs, maxtasksperchild=1) as pool,
            tqdm(total=len(file_name_or_path), desc="Generating LobsterPy summary stats") as pbar,
        ):
            for _, result in enumerate(pool.imap_unordered(self._featurizelobsterpy, file_name_or_path, chunksize=1)):
                pbar.update()
                row.append(result)

        df_lobsterpy = pd.concat(row)
        df_lobsterpy.sort_index(inplace=True)  # noqa: PD002

        paths = [
            os.path.join(self.path_to_lobster_calcs, f)
            for f in os.listdir(self.path_to_lobster_calcs)
            if not f.startswith("t")
            and not f.startswith(".")
            and os.path.isdir(os.path.join(self.path_to_lobster_calcs, f))
        ]

        row = []
        with (
            mp.Pool(processes=self.n_jobs, maxtasksperchild=1) as pool,
            tqdm(total=len(paths), desc="Generating COHP/COOP/COBI summary stats") as pbar,
        ):
            for _, result in enumerate(pool.imap_unordered(self._featurizecoxx, paths, chunksize=1)):
                pbar.update()
                row.append(result)

        df_coxx = pd.concat(row)
        df_coxx.sort_index(inplace=True)  # noqa: PD002

        row = []
        with (
            mp.Pool(processes=self.n_jobs, maxtasksperchild=1) as pool,
            tqdm(total=len(paths), desc="Generating charge based features") as pbar,
        ):
            for _, result in enumerate(pool.imap_unordered(self._featurizecharges, paths, chunksize=1)):
                pbar.update()
                row.append(result)

        df_charges = pd.concat(row)
        df_charges.sort_index(inplace=True)  # noqa: PD002

        return pd.concat([df_lobsterpy, df_coxx, df_charges], axis=1)





[docs]
class BatchCoxxFingerprint:
    """
    BatchFeaturizer to generate COHP/COOP/COBI fingerprints and Tanimoto index similarity matrix.

    :param path_to_lobster_calcs: path to root directory consisting of all lobster calc
    :param feature_type: set the feature type for moment features.
        Possible options are `bonding`, `antibonding` or `overall`
    :param label_list: bond labels list for which fingerprints needs to be generated.
    :param tanimoto: bool to state to compute tanimoto index between fingerprint objects
    :param normalize: bool to state to normalize the fingerprint data
    :param spin_type: can be `summed` or `up` or `down`.
    :param n_bins: sets number for bins for fingerprint objects
    :param e_range: range of energy relative to fermi for which moment features needs to be computed
    :param n_jobs: number of parallel processes to run
    :param fingerprint_for: Possible options are `cohp` or `cobi` or `coop`.
        Based on this fingerprints will be computed for COHPCAR/COBICAR/COOPCAR.lobster files

    Attributes:
        fingerprint_df: A pandas dataframe with fingerprint objects
    """

    def __init__(
        self,
        path_to_lobster_calcs: str | Path,
        feature_type: Literal["bonding", "antibonding", "overall"] = "overall",
        label_list: list[str] | None = None,
        tanimoto: bool = True,
        normalize: bool = True,
        spin_type: Literal["summed", "up", "down"] = "summed",
        n_bins: int = 56,
        e_range: list[float] = [-15.0, 0.0],
        n_jobs=4,
        fingerprint_for: Literal["cohp", "cobi", "coop"] = "cohp",
    ):
        """
        Generate COHP/COOP/COBI fingerprints and pair-wise Tanimoto index similarity matrix.

        :param path_to_lobster_calcs: path to root directory consisting of all lobster calc
        :param feature_type: set the feature type for moment features.
            Possible options are `bonding`, `antibonding` or `overall`
        :param label_list: bond labels list for which fingerprints needs to be generated.
        :param tanimoto: bool to state to compute tanimoto index between fingerprint objects
        :param normalize: bool to state to normalize the fingerprint data
        :param spin_type: can be `summed` or `up` or `down`.
        :param n_bins: sets number for bins for fingerprint objects
        :param e_range: range of energy relative to fermi for which moment features needs to be computed
        :param n_jobs: number of parallel processes to run
        :param fingerprint_for: Possible options are `cohp` or `cobi` or `coop`.
            Based on this fingerprints will be computed for COHPCAR/COOBICAR/COOPCAR.lobster files
        """
        self.path_to_lobster_calcs = path_to_lobster_calcs
        self.feature_type = feature_type
        self.tanimoto = tanimoto
        self.normalize = normalize
        self.label_list = label_list
        self.spin_type = spin_type
        self.n_bins = n_bins
        self.e_range = e_range
        self.n_jobs = n_jobs
        self.fingerprint_for = fingerprint_for

        self.fingerprint_df = self._get_fingerprints_df()


[docs]
    def get_similarity_matrix_df(self) -> pd.DataFrame:
        """
        Compute pairwise similarity index for each fingerprint object in input dataframe.

        Returns:
             A Pandas dataframe
        """
        matrix = np.full((self.fingerprint_df.shape[0], self.fingerprint_df.shape[0]), np.nan)
        for i, (_, col) in enumerate(self.fingerprint_df.iterrows()):
            for j, (_, col1) in enumerate(self.fingerprint_df.iterrows()):
                if self.tanimoto:
                    simi = self._get_fp_similarity(
                        col["COXX_FP"],
                        col1["COXX_FP"],
                        tanimoto=self.tanimoto,
                        normalize=False,
                    )
                else:
                    simi = self._get_fp_similarity(
                        col["COXX_FP"],
                        col1["COXX_FP"],
                        tanimoto=False,
                        normalize=True,
                    )
                matrix[i][j] = simi

        return pd.DataFrame(
            matrix,
            index=list(self.fingerprint_df.index),
            columns=list(self.fingerprint_df.index),
        )


    @staticmethod
    def _fp_to_dict(fp: CoxxFingerprint) -> dict:
        """
        Convert a fingerprint obj into a dictionary.

        :param fp: The fingerprint to be converted into a dictionary

        Returns:
            dict: A dict of the fingerprint Keys=type, Values=np.ndarray(energies, cohp)
        """
        fp_dict = {}
        fp_dict[fp[2]] = np.array([fp[0], fp[1]], dtype="object").T

        return fp_dict

    @staticmethod
    def _get_fp_similarity(
        fp1: CoxxFingerprint,
        fp2: CoxxFingerprint,
        col: int = 1,
        pt: int | str = "All",
        normalize: bool = False,
        tanimoto: bool = True,
    ) -> float:
        """
        Calculate the similarity index (dot product) of two fingerprints.

        :param fp1 The 1st CoxxFingerprint object
        :param fp2: The 2nd CoxxFingerprint object
        :param col: The item in the fingerprints (0:energies,1: coxxs) to take the dot product of (default is 1)
        :param pt: The index of the point that the dot product is to be taken (default is All)
        :param normalize: If True normalize the scalar product to 1 (default is False)
        :param tanimoto: If True will compute Tanimoto index (default is False)

        Raises:
            ValueError: If both tanimoto and normalize are set to True.

        Returns:
            Similarity index (float): The value of dot product

        """
        fp1_dict = BatchCoxxFingerprint._fp_to_dict(fp1) if not isinstance(fp1, dict) else fp1

        fp2_dict = BatchCoxxFingerprint._fp_to_dict(fp2) if not isinstance(fp2, dict) else fp2

        if pt == "All":
            vec1 = np.array([pt[col] for pt in fp1_dict.values()]).flatten()
            vec2 = np.array([pt[col] for pt in fp2_dict.values()]).flatten()
        else:
            vec1 = fp1_dict[fp1[2][pt]][col]  # type: ignore
            vec2 = fp2_dict[fp2[2][pt]][col]  # type: ignore

        if not normalize and tanimoto:
            rescale = np.linalg.norm(vec1) ** 2 + np.linalg.norm(vec2) ** 2 - np.dot(vec1, vec2)

        elif not tanimoto and normalize:
            rescale = np.linalg.norm(vec1) * np.linalg.norm(vec2)

        elif not tanimoto and not normalize:
            rescale = 1.0

        else:
            raise ValueError(
                "Cannot compute similarity index. Please set either normalize=True or tanimoto=True or both to False."
            )
        return np.dot(vec1, vec2) / rescale

    def _fingerprint_df(self, path_to_lobster_calc: str | Path) -> pd.DataFrame:
        """
        Build the fingerprint dataframe of one calc via FeaturizeCOXX.get_coxx_fingerprint_df.

        `fingerprint_for` (compared case-insensitively) selects the COBI or COOP
        files; every other value falls back to the COHP files.

        :param path_to_lobster_calc: path to root LOBSTER calculation directory.

        Returns:
            A pandas dataframe with COXX fingerprint object

        """
        # kind -> (coxxcar key, icoxxlist key, are_cobis, are_coops)
        file_sets = {
            "COBI": ("cobicar", "icobilist", True, False),
            "COOP": ("coopcar", "icooplist", False, True),
        }
        cohp_default = ("cohpcar", "icohplist", False, False)
        car_key, list_key, are_cobis, are_coops = file_sets.get(self.fingerprint_for.upper(), cohp_default)

        file_paths = get_file_paths(
            path_to_lobster_calc=path_to_lobster_calc, requested_files=["structure", car_key, list_key]
        )

        featurizer = FeaturizeCOXX(
            path_to_coxxcar=str(file_paths.get(car_key)),
            path_to_icoxxlist=str(file_paths.get(list_key)),
            path_to_structure=str(file_paths.get("structure")),
            feature_type=self.feature_type,
            e_range=self.e_range,
            are_coops=are_coops,
            are_cobis=are_cobis,
        )

        return featurizer.get_coxx_fingerprint_df(
            spin_type=self.spin_type,
            n_bins=self.n_bins,
            normalize=self.normalize,
            label_list=self.label_list,
        )

    def _get_fingerprints_df(self) -> pd.DataFrame:
        """
        Collect the fingerprint dataframes of all calc directories in a process pool.

        Sub-directories of `path_to_lobster_calcs` whose name starts with `t` or `.`
        are skipped. Each remaining one is passed to
        BatchCoxxFingerprint._fingerprint_df.

        Returns:
            A pandas dataframe with COXX fingerprint objects, sorted by index

        """
        root = self.path_to_lobster_calcs
        calc_dirs = []
        for entry in os.listdir(root):
            if entry.startswith(("t", ".")):
                continue
            candidate = os.path.join(root, entry)
            if os.path.isdir(candidate):
                calc_dirs.append(candidate)

        collected = []
        with mp.Pool(processes=self.n_jobs, maxtasksperchild=1) as pool:
            with tqdm(
                total=len(calc_dirs),
                desc=f"Generating {self.fingerprint_for.upper()} fingerprints",
            ) as pbar:
                for frame in pool.imap_unordered(self._fingerprint_df, calc_dirs, chunksize=1):
                    pbar.update()
                    collected.append(frame)

        # Workers finish in arbitrary order, hence the sort
        return pd.concat(collected).sort_index()




[docs]
class BatchStructureGraphs:
    """
    Batch Featurizer that generates structure graphs with lobster data.

    :param path_to_lobster_calcs: path to root directory consisting of all lobster calc
    :param add_additional_data_sg: bool indicating whether to include `icoop` and `icobi` data as edge properties
    :param which_bonds: selects which kind of bonds are analyzed. "all" is the default
    :param cutoff_icohp: only bonds that are stronger than cutoff_icohp * strongest ICOHP will be considered.
    :param noise_cutoff: if provided hardcodes the lower limit of icohps considered.
    :param start: start energy for bonding antibonding percent integration
    :param n_jobs: parallel processes to run

    """

    def __init__(
        self,
        path_to_lobster_calcs: str | Path,
        add_additional_data_sg: bool = True,
        which_bonds: Literal["cation-anion", "all"] = "all",
        cutoff_icohp: float = 0.10,
        noise_cutoff: float = 0.1,
        start: float | None = None,
        n_jobs: int = 4,
    ):
        """
        Generate structure graphs with LOBSTER data via multiprocessing.

        :param path_to_lobster_calcs: path to root directory consisting of all lobster calc
        :param add_additional_data_sg: bool indicating whether to include `icoop` and `icobi` data as edge properties
        :param which_bonds: selects which kind of bonds are analyzed. "all" is the default
        :param cutoff_icohp: only bonds that are stronger than cutoff_icohp * strongest ICOHP will be considered.
        :param noise_cutoff: if provided hardcodes the lower limit of icohps considered.
        :param start: start energy for bonding antibonding percent integration
        :param n_jobs: parallel processes to run

        """
        self.path_to_lobster_calcs = path_to_lobster_calcs
        self.add_additional_data_sg = add_additional_data_sg
        self.which_bonds = which_bonds
        self.cutff_icohp = cutoff_icohp
        self.noise_cutoff = noise_cutoff
        self.start = start
        self.n_jobs = n_jobs

    def _get_sg_df(self, path_to_lobster_calc: str | Path) -> pd.DataFrame:
        """
        Build the structure graph of a single LOBSTER calculation.

        :param path_to_lobster_calc: path to root LOBSTER calculation directory

        Returns:
            A single-row pandas dataframe indexed by the calc directory name whose
            `structure_graph` column holds the graph (`LobsterGraph.sg`)
        """
        calc_id = Path(path_to_lobster_calc).name
        located = get_file_paths(
            path_to_lobster_calc=path_to_lobster_calc,
            requested_files=["charge", "cohpcar", "icohplist", "icooplist", "icobilist", "madelung", "structure"],
        )

        # Apart from the structure (passed as poscar), keyword and file key share the same suffix
        same_named = ("charge", "cohpcar", "icohplist", "icooplist", "icobilist", "madelung")
        path_kwargs = {f"path_to_{key}": str(located.get(key)) for key in same_named}

        lobster_graph = LobsterGraph(
            path_to_poscar=str(located.get("structure")),
            add_additional_data_sg=self.add_additional_data_sg,
            which_bonds=self.which_bonds,
            cutoff_icohp=self.cutff_icohp,
            noise_cutoff=self.noise_cutoff,
            start=self.start,
            **path_kwargs,
        )

        frame = pd.DataFrame(index=[calc_id])
        frame.loc[calc_id, "structure_graph"] = lobster_graph.sg

        return frame


[docs]
    def get_df(self) -> pd.DataFrame:
        """
        Generate the structure graphs of all calc directories in a process pool.

        Sub-directories of `path_to_lobster_calcs` whose name starts with `t` or `.`
        are skipped.

        Returns:
            Returns a pandas dataframe with one row per calc directory, sorted by index

        """
        root = self.path_to_lobster_calcs
        calc_dirs = []
        for entry in os.listdir(root):
            if entry.startswith(("t", ".")):
                continue
            candidate = os.path.join(root, entry)
            if os.path.isdir(candidate):
                calc_dirs.append(candidate)

        graphs = []
        with mp.Pool(processes=self.n_jobs, maxtasksperchild=1) as pool:
            with tqdm(total=len(calc_dirs), desc="Generating Structure Graphs") as pbar:
                for frame in pool.imap_unordered(self._get_sg_df, calc_dirs, chunksize=1):
                    pbar.update()
                    graphs.append(frame)

        # Workers finish in arbitrary order, hence the sort
        return pd.concat(graphs).sort_index()





[docs]
class BatchDosFeaturizer:
    """
    BatchFeaturizer to generate Lobster DOS moment features and fingerprints.

    :param path_to_lobster_calcs: path to root directory consisting of all lobster calc
    :param add_element_dos_moments: add element dos moment features alongside orbital dos
    :param normalize: bool to state to normalize the fingerprint data
    :param n_bins: sets number for bins for fingerprint objects
    :param e_range: range of energy relative to fermi for which moment features needs to be computed
    :param n_jobs: number of parallel processes to run
    :param fingerprint_type: Specify fingerprint type to compute, can accept `s`, `p`, `d`, `f`,
        `summed_pdos` or `tdos` (default is summed_pdos)
    :param use_lso_dos: Will force featurizer to use DOSCAR.LSO.lobster instead of DOSCAR.lobster

    """

    def __init__(
        self,
        path_to_lobster_calcs: str | Path,
        add_element_dos_moments: bool = False,
        fingerprint_type: Literal["s", "p", "d", "f", "summed_pdos", "tdos"] = "summed_pdos",
        normalize: bool = True,
        n_bins: int = 56,
        e_range: list[float] = [-15.0, 0.0],
        n_jobs=4,
        use_lso_dos: bool = True,
    ):
        """
        Initialize BatchDosFeaturizer.

        :param path_to_lobster_calcs: path to root directory consisting of all lobster calc
        :param add_element_dos_moments: add element dos moment features alongside orbital dos
        :param normalize: bool to state to normalize the fingerprint data
        :param n_bins: sets number for bins for fingerprint objects
        :param e_range: range of energy relative to fermi for which moment features needs to be computed
        :param n_jobs: number of parallel processes to run
        :param fingerprint_type: Specify fingerprint type to compute, can accept `{s/p/d/f/tdos/summed_{pdos}`
            (default is summed_pdos)
        :param use_lso_dos: Will force featurizer to use DOSCAR.LSO.lobster instead of DOSCAR.lobster
        """
        self.path_to_lobster_calcs = path_to_lobster_calcs
        self.add_element_dos_moments = add_element_dos_moments
        self.fingerprint_type = fingerprint_type
        self.e_range = e_range
        self.normalize = normalize
        self.n_jobs = n_jobs
        self.n_bins = n_bins
        self.use_lso_dos = use_lso_dos

    def _get_dos_moments_df(self, path_to_lobster_calc: str | Path) -> pd.DataFrame:
        """
        Compute the DOS moment features of a single calc using FeaturizeDoscar.

        :param path_to_lobster_calc: path to root LOBSTER calculation directory.

        Returns:
            A pandas dataframe with computed PDOS moment features
        """
        located = get_file_paths(
            path_to_lobster_calc=path_to_lobster_calc,
            requested_files=["structure", "doscar"],
            use_lso_dos=self.use_lso_dos,
        )
        doscar_path = str(located.get("doscar"))
        structure_path = str(located.get("structure"))

        return FeaturizeDoscar(
            path_to_doscar=doscar_path,
            path_to_structure=structure_path,
            add_element_dos_moments=self.add_element_dos_moments,
            e_range=self.e_range,
        ).get_df()

    def _get_dos_fingerprints_df(self, path_to_lobster_calc: str | Path) -> pd.DataFrame:
        """
        Compute the DOS fingerprint of a single calc using FeaturizeDoscar.

        The fingerprint type, normalization and number of bins are taken from the
        instance settings.

        :param path_to_lobster_calc: path to root LOBSTER calculation directory.

        Returns:
            A pandas dataframe with DOS fingerprint objects
        """
        located = get_file_paths(
            path_to_lobster_calc=path_to_lobster_calc,
            requested_files=["structure", "doscar"],
            use_lso_dos=self.use_lso_dos,
        )
        doscar_path = str(located.get("doscar"))
        structure_path = str(located.get("structure"))

        dos_featurizer = FeaturizeDoscar(
            path_to_doscar=doscar_path,
            path_to_structure=structure_path,
            e_range=self.e_range,
        )
        fingerprint_settings = {
            "fp_type": self.fingerprint_type,
            "normalize": self.normalize,
            "n_bins": self.n_bins,
        }

        return dos_featurizer.get_fingerprint_df(**fingerprint_settings)


[docs]
    def get_df(self) -> pd.DataFrame:
        """
        Generate the DOS moment features of all calc directories in a process pool.

        Moment features are PDOS (optional: element dos) center, width, skewness, kurtosis
        and upper band edge. Sub-directories of `path_to_lobster_calcs` whose name
        starts with `t` or `.` are skipped.

        Returns:
            A pandas dataframe with moment features, sorted by index
        """
        root = self.path_to_lobster_calcs
        calc_dirs = []
        for entry in os.listdir(root):
            if entry.startswith(("t", ".")):
                continue
            candidate = os.path.join(root, entry)
            if os.path.isdir(candidate):
                calc_dirs.append(candidate)

        moment_frames = []
        with mp.Pool(processes=self.n_jobs, maxtasksperchild=1) as pool:
            with tqdm(total=len(calc_dirs), desc="Generating PDOS moment features") as pbar:
                for frame in pool.imap_unordered(self._get_dos_moments_df, calc_dirs, chunksize=1):
                    pbar.update()
                    moment_frames.append(frame)

        # Workers finish in arbitrary order, hence the sort
        return pd.concat(moment_frames).sort_index()



[docs]
    def get_fingerprints_df(self) -> pd.DataFrame:
        """
        Generate the DOS fingerprints of all calc directories in a process pool.

        Sub-directories of `path_to_lobster_calcs` whose name starts with `t` or `.`
        are skipped.

        Returns:
            A pandas dataframe with fingerprint objects, sorted by index
        """
        root = self.path_to_lobster_calcs
        calc_dirs = []
        for entry in os.listdir(root):
            if entry.startswith(("t", ".")):
                continue
            candidate = os.path.join(root, entry)
            if os.path.isdir(candidate):
                calc_dirs.append(candidate)

        fingerprint_frames = []
        with mp.Pool(processes=self.n_jobs, maxtasksperchild=1) as pool:
            with tqdm(total=len(calc_dirs), desc="Generating DOS fingerprints") as pbar:
                for frame in pool.imap_unordered(self._get_dos_fingerprints_df, calc_dirs, chunksize=1):
                    pbar.update()
                    fingerprint_frames.append(frame)

        # Workers finish in arbitrary order, hence the sort
        return pd.concat(fingerprint_frames).sort_index()





[docs]
class BatchIcoxxlistFeaturizer:
    """
    BatchFeaturizer to generate BWDF-derived features from ICOXXLIST.lobster data.

    :param path_to_lobster_calcs: path to root directory consisting of all lobster calc
    :param max_length: maximum bond length for BWDF computation
    :param min_length: minimum bond length for BWDF computation
    :param normalization: normalization strategy for BWDF
    :param bin_width: bin width for BWDF
    :param bwdf_df_type: Type of BWDF dataframe to generate

        - "binned": Binned BWDF function.
        - "stats": Statistical features of BWDF function.
        - "sorted_bwdf": BWDF values sorted by distances, ascending.
        - "sorted_dists": Distances sorted by BWDF values (either only positive or negative),
          sorted descending by absolute values.
    :param sorted_dists_mode: only applies if bwdf_df_type=="sorted_dists".
        Corresponds to param "mode" of get_sorted_dist_df, defines whether BWDF values above or
        below zero are considered for distance featurization.
    :param interactions_tol: tolerance for interactions
    :param read_icobis: bool to state to read ICOBILIST.lobster from the path
    :param read_icoops: bool to state to read ICOOPLIST.lobster from the path
    :param n_jobs: number of parallel processes to run

    """

    def __init__(
        self,
        path_to_lobster_calcs: str | Path,
        normalization: Literal["formula_units", "area", "counts", "none"] = "formula_units",
        bin_width: float = 0.02,
        bwdf_df_type: Literal["binned", "stats", "sorted_bwdf", "sorted_dists"] = "stats",
        sorted_dists_mode: Literal["positive", "negative"] = "negative",
        interactions_tol: float = 1e-3,
        max_length: float = 6.0,
        min_length: float = 0.0,
        read_icobis: bool = False,
        read_icoops: bool = False,
        n_jobs=4,
    ):
        """
        Initialize BatchIcoxxlistFeaturizer.

        :param path_to_lobster_calcs: path to root directory consisting of all lobster calc
        :param max_length: maximum bond length for BWDF computation
        :param min_length: minimum bond length for BWDF computation
        :param normalization: normalization strategy for BWDF
        :param bin_width: bin width for BWDF
        :param bwdf_df_type: Type of BWDF dataframe to generate

            - "binned": Binned BWDF function.
            - "stats": Statistical features of BWDF function.
            - "sorted_bwdf": BWDF values sorted by distances, ascending.
            - "sorted_dists": Distances sorted by BWDF values (either only positive or negative),
              sorted descending by absolute values.
        :param sorted_dists_mode: only applies if bwdf_df_type=="sorted_dists".
            Corresponds to param "mode" of get_sorted_dist_df, defines whether BWDF values above or
            below zero are considered for distance featurization.
        :param interactions_tol: tolerance for interactions
        :param read_icobis: bool to state to read ICOBILIST.lobster from the path
        :param read_icoops: bool to state to read ICOOPLIST.lobster from the path
        :param n_jobs: number of parallel processes to run
        """
        self.path_to_lobster_calcs = path_to_lobster_calcs
        self.normalization = normalization
        self.max_length = max_length
        self.min_length = min_length
        self.bin_width = bin_width
        self.interactions_tol = interactions_tol
        self.bwdf_df_type = bwdf_df_type
        self.sorted_dists_mode = sorted_dists_mode
        self.read_icobis = read_icobis
        self.read_icoops = read_icoops
        self.n_jobs = n_jobs

    def _get_icoxxlist_bwdf_df(self, path_to_lobster_calc: str | Path) -> pd.DataFrame:
        """
        Featurize the ICOXXLIST data of a single calc using FeaturizeIcoxxlist.

        The list file that is requested depends on the flags set at initialization:
        "icobilist" if ``read_icobis`` is set, otherwise "icooplist" if
        ``read_icoops`` is set, otherwise "icohplist".

        :param path_to_lobster_calc: path to root LOBSTER calculation directory

        Returns:
            A pandas dataframe with the BWDF features selected by ``bwdf_df_type``
        """
        # read_icobis takes precedence when both flags are set; both flags are
        # still forwarded unchanged to FeaturizeIcoxxlist below.
        # NOTE(review): how FeaturizeIcoxxlist treats are_cobis and are_coops both
        # being True is not visible here -- confirm.
        if self.read_icobis:
            icoxxlist_key = "icobilist"
        elif self.read_icoops:
            icoxxlist_key = "icooplist"
        else:
            icoxxlist_key = "icohplist"

        file_paths = get_file_paths(
            path_to_lobster_calc=path_to_lobster_calc,
            requested_files=["structure", icoxxlist_key],
        )
        feat_icoxx = FeaturizeIcoxxlist(
            path_to_icoxxlist=file_paths.get(icoxxlist_key),
            path_to_structure=file_paths.get("structure"),
            bin_width=self.bin_width,
            interactions_tol=self.interactions_tol,
            normalization=self.normalization,
            max_length=self.max_length,
            min_length=self.min_length,
            are_cobis=self.read_icobis,
            are_coops=self.read_icoops,
        )

        if self.bwdf_df_type == "binned":
            return feat_icoxx.get_binned_bwdf_df()
        if self.bwdf_df_type == "sorted_bwdf":
            return feat_icoxx.get_sorted_bwdf_df()
        if self.bwdf_df_type == "sorted_dists":
            return feat_icoxx.get_sorted_dist_df(mode=self.sorted_dists_mode)
        # "stats" and any unrecognized value fall back to the statistical features.
        return feat_icoxx.get_stats_df()


[docs]
    def get_df(self) -> pd.DataFrame:
        """
        Generate a pandas dataframe with BWDF for all calcs.

        Every sub-directory of ``path_to_lobster_calcs`` whose name does not start
        with "t" or "." is handed to ``_get_icoxxlist_bwdf_df`` in a worker
        process, and the per-calc dataframes are concatenated.

        Returns:
            A pandas dataframe with BWDF features as columns, sorted by index.
            The features can be either binned, sorted or statistical.
            Depends on the "bwdf_df_type" parameter set when the class is initialized.

        Raises:
            ValueError: if no calc directory is found in ``path_to_lobster_calcs``
        """
        root = self.path_to_lobster_calcs
        # NOTE(review): names starting with "t" are skipped in addition to hidden
        # entries; the reason is not visible here -- confirm this is intended.
        paths = [
            calc_dir
            for calc_dir in (os.path.join(root, name) for name in os.listdir(root))
            if not os.path.basename(calc_dir).startswith(("t", ".")) and os.path.isdir(calc_dir)
        ]
        if not paths:
            # Fail before any worker process is started; pd.concat would otherwise
            # raise a ValueError that does not mention the directory.
            raise ValueError(f"No LOBSTER calculation directories found in {root}")

        frames = []
        with (
            mp.Pool(processes=self.n_jobs, maxtasksperchild=1) as pool,
            tqdm(total=len(paths), desc="Generating BWDF from ICOXXLIST") as pbar,
        ):
            for result in pool.imap_unordered(self._get_icoxxlist_bwdf_df, paths, chunksize=1):
                pbar.update()
                frames.append(result)

        df_icoxxlist = pd.concat(frames)
        if self.bwdf_df_type in ["sorted_bwdf", "sorted_dists"]:
            # Gaps left by the concatenation are filled with 0.0 for the sorted
            # feature types (presumably calcs yield differing numbers of columns).
            df_icoxxlist = df_icoxxlist.fillna(value=0.0)
        # imap_unordered yields results in completion order, so restore a stable order.
        df_icoxxlist.sort_index(inplace=True)  # noqa: PD002

        return df_icoxxlist