Source code for PDBNucleicAcids.MMCIF2DataFrame

"""Extract basepair data from the NDB/NAKB Data Category in mmCif files."""

from pathlib import Path
import pandas as pd
from Bio.PDB.MMCIF2Dict import MMCIF2Dict

# absolute import
from PDBNucleicAcids.utils import get_paired_segments



[docs]
def basepair_dataframe_from_mmcif(
    mmcif_filepath: str | Path,
) -> pd.DataFrame | None:
    """
    Return dataframe with base pairs data from a mmCif file.

    Each row describes one base pair read from the
    "ndb_struct_na_base_pair" Data Category, annotated on both sides
    ("i" and "j") with the polymer information of the chain the residue
    belongs to, plus a "paired_segment" column.

    Note: not all mmCif files contain Data Category "ndb_struct_na_base_pair"

    Parameters
    ----------
    mmcif_filepath : str | Path
        mmCif filepath.

    Returns
    -------
    pandas.DataFrame | None
        Dataframe with base pairs data in the structure, or None when the
        base pair items or the polymer information are not available in
        the file.

    Raises
    ------
    ValueError
        If an "auth_seq_id" value cannot be converted to int.

    """
    mmcif_dict = MMCIF2Dict(str(mmcif_filepath))

    # output column -> mmCif item; every item is required
    base_pair_items = {
        "pair_name": "_ndb_struct_na_base_pair.pair_name",
        "i_chain_id": "_ndb_struct_na_base_pair.i_auth_asym_id",
        "i_residue_index": "_ndb_struct_na_base_pair.i_auth_seq_id",
        "i_residue_name": "_ndb_struct_na_base_pair.i_label_comp_id",
        "j_chain_id": "_ndb_struct_na_base_pair.j_auth_asym_id",
        "j_residue_index": "_ndb_struct_na_base_pair.j_auth_seq_id",
        "j_residue_name": "_ndb_struct_na_base_pair.j_label_comp_id",
    }

    try:
        columns = {
            column: mmcif_dict[item]
            for column, item in base_pair_items.items()
        }
    except KeyError as e:
        # in this case at least one of these keys is not present
        # so there is no basepair datastructure
        print(e)
        return None

    # residue indexes are parsed as strings, store them as integers
    for column in ("i_residue_index", "j_residue_index"):
        columns[column] = [int(value) for value in columns[column]]
    base_pairs_df = pd.DataFrame(columns)

    # NOTE: this parses the file a second time
    polymers_df = polymer_dataframe_from_mmcif(mmcif_filepath)
    if polymers_df is None:
        # without polymer information the base pairs cannot be annotated
        return None

    # A polymer entity shared by several chains lists its chain ids
    # comma-separated (i.e. "A,B,C,D"). Expand to one row per chain,
    # otherwise the joins below drop every base pair on those chains.
    polymers_df = polymers_df.assign(
        chain_id=polymers_df["chain_id"].str.split(",")
    ).explode("chain_id", ignore_index=True)
    polymers_df["chain_id"] = polymers_df["chain_id"].str.strip()

    # Two views of the polymer data, one for the "i" side and one for the
    # "j" side of the pair, told apart by a column prefix
    i_polymers_df = polymers_df.add_prefix("i_")
    j_polymers_df = polymers_df.add_prefix("j_")

    # dataframe with base pair information AND polymer information
    # by inner joining the base pairs with the "i" and "j" polymer data
    result_df = pd.merge(base_pairs_df, i_polymers_df, on="i_chain_id")
    result_df = pd.merge(result_df, j_polymers_df, on="j_chain_id")

    # rearranging columns: "i" side left to right, "j" side mirrored
    # copy so that adding a column below acts on an independent dataframe
    result_df = result_df[
        [
            "pair_name",
            "i_polymer_type",
            "i_non_standard_linkage",
            "i_non_standard_residue",
            "i_chain_id",
            "i_residue_index",
            "i_residue_name",
            "j_residue_name",
            "j_residue_index",
            "j_chain_id",
            "j_non_standard_residue",
            "j_non_standard_linkage",
            "j_polymer_type",
        ]
    ].copy()

    # Add a column with an index that indicates a paired segment
    # i.e.
    # A  T  0
    # C  G  0
    # C  G  0
    # T  A  1
    result_df["paired_segment"] = get_paired_segments(result_df)

    return result_df




[docs]
def polymer_dataframe_from_mmcif(
    mmcif_filepath: str | Path,
) -> pd.DataFrame | None:
    """
    Build a dataframe of polymer data from a mmCif file.

    The dataframe has the columns "polymer_type", "non_standard_linkage",
    "non_standard_residue" and "chain_id", all read from "_entity_poly"
    items.

    Parameters
    ----------
    mmcif_filepath : str | Path
        mmCif filepath.

    Returns
    -------
    pandas.DataFrame | None
        Dataframe with information about each polymer in the structure.
        None when one of the required "_entity_poly" items is missing
        (the missing key is printed) or when no chain id item is found.

    """
    # TODO test with weird chain ids like multimeric
    # i.e. "A,B,C,D"
    cif = MMCIF2Dict(str(mmcif_filepath))

    try:
        table = pd.DataFrame(
            {
                "polymer_type": cif["_entity_poly.type"],
                "non_standard_linkage": cif["_entity_poly.nstd_linkage"],
                "non_standard_residue": cif["_entity_poly.nstd_monomer"],
            }
        )
    except KeyError as missing:
        print(missing)
        return None

    # The chain ids can be stored under several item names:
    # ndb (nucleotide db) or pdbx (protein db X) flavour, and named
    # either "chain" or "strand". The first one present wins.
    chain_id_tags = (
        "_entity_poly.pdbx_chain_id",
        "_entity_poly.pdbx_strand_id",
        "_entity_poly.ndb_chain_id",
        "_entity_poly.ndb_strand_id",
    )
    for tag in chain_id_tags:
        if tag in cif:
            table["chain_id"] = cif[tag]
            return table

    # no chain id item available
    return None
Source code for PDBNucleicAcids.MMCIF2DataFrame

PDBNucleicAcids

Navigation

Related Topics