# Source code for PDBNucleicAcids.MMCIF2DataFrame
"""Extract basepair data from the NDB/NAKB Data Category in mmCif files."""
from pathlib import Path
import pandas as pd
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
# absolute import
from PDBNucleicAcids.utils import get_paired_segments
[docs]
def basepair_dataframe_from_mmcif(
    mmcif_filepath: str | Path,
) -> pd.DataFrame | None:
    """
    Return dataframe with base pairs data from a mmCif file.

    The base pairs are read from the Data Category
    "ndb_struct_na_base_pair" and joined with the polymer information
    returned by ``polymer_dataframe_from_mmcif`` for both sides of the
    pair ("i" and "j").

    Note: not all mmCif files contain Data Category
    "ndb_struct_na_base_pair"

    Parameters
    ----------
    mmcif_filepath : str | Path
        mmCif filepath.

    Returns
    -------
    pandas.DataFrame | None
        Dataframe with base pairs data in the structure, one row per
        base pair whose two chains both have polymer information, plus
        a "paired_segment" column.
        None if the base pair Data Category is missing (the KeyError is
        printed) or if no polymer dataframe could be built.
    """
    mmcif_dict = MMCIF2Dict(str(mmcif_filepath))

    try:
        base_pairs_df = pd.DataFrame(
            {
                "pair_name": mmcif_dict["_ndb_struct_na_base_pair.pair_name"],
                "i_chain_id": mmcif_dict[
                    "_ndb_struct_na_base_pair.i_auth_asym_id"
                ],
                "i_residue_index": [
                    int(index)
                    for index in mmcif_dict[
                        "_ndb_struct_na_base_pair.i_auth_seq_id"
                    ]
                ],
                "i_residue_name": mmcif_dict[
                    "_ndb_struct_na_base_pair.i_label_comp_id"
                ],
                "j_chain_id": mmcif_dict[
                    "_ndb_struct_na_base_pair.j_auth_asym_id"
                ],
                "j_residue_index": [
                    int(index)
                    for index in mmcif_dict[
                        "_ndb_struct_na_base_pair.j_auth_seq_id"
                    ]
                ],
                "j_residue_name": mmcif_dict[
                    "_ndb_struct_na_base_pair.j_label_comp_id"
                ],
            }
        )
    except KeyError as e:
        # in this case at least one of these keys is not present
        # so there is no basepair datastructure
        print(e)
        return None

    polymers_df = polymer_dataframe_from_mmcif(mmcif_filepath)
    if polymers_df is None:
        # no polymer information available, nothing to join against
        return None

    # An entity made of several chains lists all of them in a single
    # comma separated value, i.e. "A,B,C,D", while base pairs refer to
    # one chain at a time. Give every chain its own row so that the
    # joins below do not silently drop base pairs of such entities.
    polymers_df = polymers_df.assign(
        chain_id=polymers_df["chain_id"].astype(str).str.split(",")
    ).explode("chain_id", ignore_index=True)
    polymers_df["chain_id"] = polymers_df["chain_id"].str.strip()

    # Two views of the same polymer data:
    # one for the "i" side and the other for the "j" side of the pair
    i_polymers_df = polymers_df.add_prefix("i_")
    j_polymers_df = polymers_df.add_prefix("j_")

    # dataframe with base pair information AND polymer information
    # by inner joining the two dataframes, from "i" and "j"
    result_df = pd.merge(base_pairs_df, i_polymers_df, on="i_chain_id")
    result_df = pd.merge(result_df, j_polymers_df, on="j_chain_id")

    # rearranging columns: "i" side first, then the "j" side mirrored.
    # The copy makes the result independent from the merged dataframe
    # before a new column is added to it.
    column_order = [
        "pair_name",
        "i_polymer_type",
        "i_non_standard_linkage",
        "i_non_standard_residue",
        "i_chain_id",
        "i_residue_index",
        "i_residue_name",
        "j_residue_name",
        "j_residue_index",
        "j_chain_id",
        "j_non_standard_residue",
        "j_non_standard_linkage",
        "j_polymer_type",
    ]
    result_df = result_df[column_order].copy()

    # Add a column with an index that indicates a paired segment
    # i.e.
    # A T 0
    # C G 0
    # C G 0
    # T A 1
    result_df["paired_segment"] = get_paired_segments(result_df)

    return result_df
[docs]
def polymer_dataframe_from_mmcif(
    mmcif_filepath: str | Path,
) -> pd.DataFrame | None:
    """
    Build a dataframe describing the polymers listed in a mmCif file.

    The columns "polymer_type", "non_standard_linkage" and
    "non_standard_residue" are filled from the "_entity_poly" items
    "type", "nstd_linkage" and "nstd_monomer". A "chain_id" column is
    added from the first chain/strand id item found in the file.

    Parameters
    ----------
    mmcif_filepath : str | Path
        mmCif filepath.

    Returns
    -------
    pandas.DataFrame | None
        Dataframe with information about each polymer in the structure.
        None if one of the three "_entity_poly" items is missing (the
        KeyError is printed) or if no chain/strand id item is present.
    """
    # TODO test with weird chain ids like multimeric
    # i.e. "A,B,C,D"
    parsed = MMCIF2Dict(str(mmcif_filepath))

    # dataframe column -> mmCif item it is read from
    column_sources = (
        ("polymer_type", "_entity_poly.type"),
        ("non_standard_linkage", "_entity_poly.nstd_linkage"),
        ("non_standard_residue", "_entity_poly.nstd_monomer"),
    )
    try:
        table = pd.DataFrame(
            {column: parsed[item] for column, item in column_sources}
        )
    except KeyError as missing:
        print(missing)
        return None

    # there are two options for chain IDs:
    # ndb (nucleotide db) and pdbx (protein db X)
    # sometimes it's called chain, sometimes it's called strand
    # The items are tried in this order and the first one present wins.
    chain_id_items = (
        "_entity_poly.pdbx_chain_id",
        "_entity_poly.pdbx_strand_id",
        "_entity_poly.ndb_chain_id",
        "_entity_poly.ndb_strand_id",
    )
    for item in chain_id_items:
        if item in parsed:
            table["chain_id"] = parsed[item]
            return table

    # none of the chain/strand id items is available
    return None