Source code for PDBNucleicAcids.NucleicAcid

"""Nucleic Acids related classes and functions."""

import warnings

import pandas as pd
from Bio.Data.PDBData import nucleic_letters_3to1_extended
from Bio.Seq import Seq
from Bio.PDB.Polypeptide import is_nucleic
from Bio.PDB.PDBExceptions import PDBException
from Bio.PDB.NeighborSearch import NeighborSearch

from Bio.PDB.Structure import Structure
from Bio.PDB.Model import Model
from Bio.PDB.Chain import Chain
from Bio.PDB.Residue import Residue
from Bio.PDB.Atom import Atom

# absolute imports
from PDBNucleicAcids.BasePairRules import dsDNAWatsonCrickBasePairRules



[docs]
def search_paired_base(
    residue: Residue,
    nucleic_chain_ids: list[str] | None = None,
    nucleic_atoms: list[Atom] | None = None,
    pairing_rules=dsDNAWatsonCrickBasePairRules,
) -> Residue | None:
    """
    Search in the vicinity of a given base for its paired base.

    Parameters
    ----------
    residue : Bio.PDB.Residue.Residue
        A Biopython nucleic acid residue (nucleotide) taken from a Biopython
        structure.
    nucleic_chain_ids : list[str], optional
        List of ids for nucleic acid chains. If None they will be
        inferred using polymer_dataframe_from_structure(). Default is None.
    nucleic_atoms : list[Atom], optional
        List of atoms from the nucleic acid chains. If None they will be
        inferred using nucleic_chain_ids(). Default is None.
    pairing_rules : optional
        Class instance with rules for proper pairing.
        PDBNucleicAcids.BasePairsRules.dsWatsonCrickBasePairsRules
        is the default. Currently it might work only with those rules

    Returns
    -------
    Bio.PDB.Residue.Residue | None
        Nucleotide that binds to the input nucleotide residue.
        None in the case there is no nucleotide paired to the input nucleotide.

    """
    # from residue get to the structure
    structure: Structure = residue.parent.parent.parent

    # in case no nucleic atoms and no nucleic chain ids were given
    # get nucleic chains using get_polymer_dataframe
    if not nucleic_atoms and not nucleic_chain_ids:
        builder = NABuilder()
        nucleic_acids = builder.build_nucleic_acids(
            structure, standard_nucleotides=False
        )
        nucleic_atoms = []
        for na in nucleic_acids:
            nucleic_atoms += na.get_atoms()

    # in case no nucleic atoms were given but
    # nucleic chain ids were given or got from get_polymer_dataframe
    # list of all nucleic atoms, compute only once
    if not nucleic_atoms and nucleic_chain_ids:
        nucleic_atoms: list[Atom] = [
            atom
            for atom in structure.get_atoms()
            if atom.parent.parent.id in nucleic_chain_ids
            and atom.parent != residue  # not from the same residue
        ]

    # if there are no nucleic atoms except from the one input residue
    # then NeighborSearch will return an error
    if len(nucleic_atoms) == 0 or nucleic_atoms is None:
        return None

    # initialize NeighborSearch class
    ns = NeighborSearch(nucleic_atoms)

    # initialize Rules class
    pairing_rules = pairing_rules()

    central_atom: Atom = pairing_rules.atom_from_central_hbond(residue)
    # if the input residue is not a DNA base then the central atom
    # will be None
    if central_atom is None:
        return None

    # search around the central atom of the residue
    atom_neighborhood = ns.search(center=central_atom.coord, radius=4.0)

    residue_neighborhood: list[Residue] = list(
        {atom.parent for atom in atom_neighborhood}
    )

    candidate_residues: list[Residue] = []

    for residue_neighbour in residue_neighborhood:
        if pairing_rules.is_candidate(base1=residue, base2=residue_neighbour):
            dist = pairing_rules.distance(
                base1=residue, base2=residue_neighbour
            )
            candidate_residues.append((residue_neighbour, dist))

    if len(candidate_residues) > 1:
        # If there is more than one paired residue
        min_tuple: tuple = min(candidate_residues, key=lambda x: x[1])
        return min_tuple[0]
    elif candidate_residues:
        # only one paired nucleotide found
        return candidate_residues[0][0]
    else:
        # no paired nucleotides found
        return None




[docs]
class NucleicAcid(list):
    """A nucleic acid is simply a list of nucleic L{Residue} objects."""


[docs]
    def get_sequence(self):
        """
        Return the nucleotide sequence as a Seq object.

        Returns
        -------
        Bio.Seq.Seq
            Sequence of nucleotides in a continous strand.

        """
        resname_list = [res.get_resname() for res in self]

        # add padding since nucleic_letters_3to1_extended requires three char
        # i.e. from "DT" to "DT "
        resname_list = [resname.ljust(3, " ") for resname in resname_list]

        seq = "".join(
            nucleic_letters_3to1_extended.get(resname, "X")
            for resname in resname_list
        )

        return Seq(seq)



[docs]
    def get_atoms(self):
        """Return atoms in the nucleic acid."""
        atom_list = []
        for res in self:
            atom_list += res.get_atoms()
        return atom_list



[docs]
    def get_chain_id(self):
        """Return chain id of the nucleic acid."""
        return self[0].parent.id



[docs]
    def get_nucleic_acid_type(self) -> str:
        """Return type of nucleic acid: DNA or RNA."""
        dna_counter = 0
        rna_counter = 0
        other_counter = 0
        other_list = []

        for res in self:
            if (
                is_nucleic(res, standard=False)
                and "O3'" in res
                and "O2'" not in res  # deoxy-ribose
            ):
                dna_counter += 1
            elif (
                is_nucleic(res, standard=False)
                and "O3'" in res
                and "O2'" in res
            ):
                rna_counter += 1
            else:
                other_counter += 1
                other_list.append(res.get_resname())

        if dna_counter == len(self):
            return "DNA"
        elif rna_counter == len(self):
            return "RNA"
        elif dna_counter + rna_counter == len(self):
            return "DNA-RNA hybrid"
        else:
            warnings.warn(f"Found these unknown residues: {other_list}")
            return "Uknown"


    def __repr__(self):
        """Return string representation of the nucleic acid."""
        start = self[0].get_id()[1]
        end = self[-1].get_id()[1]
        return f"<NucleicAcid chain={self.get_chain_id()} \
start={start} end={end}>"



class _NABuilder:
    """Base class to extract nucleic acids.

    It checks if two consecutive residues in a chain are connected.
    The connectivity test is implemented by a subclass.

    This assumes you want both standard and non-standard nucleotides.
    """

    def __init__(self, radius: float | int):
        """
        Initialize the base class.

        Parameters
        ----------
        radius : float | int
            Maximum allowed distance between P atom and O3' atom.

        """
        self.radius = radius

    def _accept(self, residue: Residue, standard_nucleotides: bool) -> bool:
        """Check if the residue is a nucleotide (PRIVATE)."""
        if is_nucleic(residue, standard=standard_nucleotides):
            return True
        elif not standard_nucleotides and "O3'" in residue.child_dict:
            # It has an alpha carbon...
            # We probably need to update the hard coded list of
            # non-standard residues, see function is_aa for details.
            warnings.warn(
                f"Assuming residue {residue.resname} is an unknown modified \
nucleotide."
            )
            return True
        else:
            # not a standard nucleotide so skip
            return False

    def build_nucleic_acids(
        self,
        entity: Structure | Model | Chain,
        standard_nucleotides: bool = True,
    ) -> list[NucleicAcid]:
        """
        Build and return a list of NucleicAcid objects.

        Parameters
        ----------
        entity : L{Structure}, L{Model} or L{Chain}
            Double-stranded nucleic acids are searched for in this object.
        standard_nucleotides : bool, optional
            Looking for standard nucleotides. The default is True.

        Raises
        ------
        PDBException
            In case input entity is not a L{Structure}, L{Model} or L{Chain}.

        Returns
        -------
        na_list : list[NucleicAcid]
            List if all NucleicAcid found in the input entity.

        """
        level = entity.get_level()
        # Decide which entity we are dealing with
        if level == "S":
            model = entity[0]
            chain_list = model.get_list()
        elif level == "M":
            chain_list = entity.get_list()
        elif level == "C":
            chain_list = [entity]
        else:
            raise PDBException("Entity should be Structure, Model or Chain.")

        # initialize list of nucleic acids (polymers)
        na_list = []
        for chain in chain_list:
            # iterator from chain
            chain_iterator = iter(chain)
            try:
                prev_res = next(chain_iterator)
                while not self._accept(prev_res, standard_nucleotides):
                    prev_res = next(chain_iterator)
            except StopIteration:
                # No interesting residues at all in this chain
                continue

            # initialize nucleic acid
            na = None
            for next_res in chain_iterator:
                if (
                    self._accept(prev_res, standard_nucleotides)
                    and self._accept(next_res, standard_nucleotides)
                    and self._is_connected(prev_res, next_res)
                ):
                    if na is None:
                        na = NucleicAcid()
                        na.append(prev_res)
                        na_list.append(na)
                    na.append(next_res)
                else:
                    # Either too far apart, or one of the residues is unwanted.
                    # End the current nucleic acid, reinitialize
                    na = None
                prev_res = next_res

        return na_list



[docs]
class NABuilder(_NABuilder):
    """Use P--O3' distance to find connected nucletides."""

    def __init__(self, radius: float | int = 1.8):
        """
        Initialize the class.

        Parameters
        ----------
        radius : float | int
            Maximum allowed distance between P atom and O3' atom.

        """
        _NABuilder.__init__(self, radius)

    def _is_connected(self, prev_res: Residue, next_res: Residue):
        if not prev_res.has_id("O3'"):
            return False
        if not next_res.has_id("P"):
            return False

        o = prev_res["O3'"]
        p = next_res["P"]

        # get all disordered atom positions
        if o.is_disordered():
            o_list = o.disordered_get_list()
        else:
            o_list = [o]
        if p.is_disordered():
            p_list = p.disordered_get_list()
        else:
            p_list = [p]

        # Test all disordered atom positions if any
        # if not then just test the atom pair
        for pp in p_list:
            for oo in o_list:
                # To form a peptide bond, N and C must be
                # within radius and have the same altloc
                # identifier or one altloc blank
                p_altloc = pp.get_altloc()
                o_altloc = oo.get_altloc()
                if (
                    p_altloc == o_altloc or p_altloc == " " or o_altloc == " "
                ) and self._test_dist(pp, oo):
                    # Select the disordered atoms that
                    # are indeed bonded
                    if o.is_disordered():
                        o.disordered_select(o_altloc)
                    if p.is_disordered():
                        p.disordered_select(p_altloc)
                    return True
        return False

    def _test_dist(self, o: Atom, p: Atom) -> bool:
        """Return True if distance between atoms<radius (PRIVATE)."""
        if (o - p) < self.radius:
            return True
        else:
            return False




[docs]
class BasePair:
    """Pair of nucleotides."""

    def __init__(self, i_res: Residue, j_res: Residue, pairing_rules):
        self.i_res = i_res
        self.j_res = j_res
        self.pairing_rules = pairing_rules


[docs]
    def get_atoms(self):
        """Return atoms in the nucleic acid."""
        atom_list = []
        atom_list += self.i_res.get_atoms()
        atom_list += self.j_res.get_atoms()
        return atom_list



[docs]
    def check_validity(self, pairing_rules=None):
        """Check for validity for base pair, using input rules."""
        if pairing_rules is None:
            pairing_rules = self.pairing_rules()
        return pairing_rules.is_candidate(self.i_res, self.j_res)


    # TODO get other information: shear, stretch, buckle, propeller, opening

[docs]
    def get_stagger(self, pairing_rules=None):
        """Return stagger value from base pair."""
        if pairing_rules is None:
            pairing_rules = self.pairing_rules()
        return pairing_rules.stagger(self.i_res, self.j_res)


    def __repr__(self):
        """Return string representation of base pair."""
        i_resname = self.i_res.get_resname()
        j_resname = self.j_res.get_resname()
        return f"<BasePair i_res={i_resname} j_res={j_resname}>"




[docs]
class DoubleStrandNucleicAcid(list):
    """List of BasePairs."""


[docs]
    def get_atoms(self) -> list[Atom]:
        """Return atoms in the double stranded nucleic acid."""
        atom_list = []
        for bp in self:
            atom_list += bp.get_atoms()
        return atom_list



[docs]
    def get_i_strand(self) -> NucleicAcid:
        """Get i-th strand as NucleicAcid object."""
        na = NucleicAcid()
        for bp in self:
            na.append(bp.i_res)
        return na



[docs]
    def get_j_strand(self) -> NucleicAcid:
        """Get j-th strand as NucleicAcid objects."""
        na = NucleicAcid()
        for bp in self:
            na.append(bp.j_res)
        return na



[docs]
    def get_nucleic_acid_complex_type(self) -> str:
        """
        Return nucleic acid complex type.

        i.e. DNA/DNA, DNA/RNA, etc
        """
        i_type = self.get_i_strand().get_nucleic_acid_type()
        j_type = self.get_j_strand().get_nucleic_acid_type()
        return f"{i_type}/{j_type}"


    # TODO get other information: shear, stretch, buckle, propeller, opening

[docs]
    def get_stagger_values(self):
        """Get stagger value for every base pair."""
        return [bp.get_stagger() for bp in self]



[docs]
    def get_dataframe(self) -> pd.DataFrame:
        """
        Return dataframe with base pairs data.

        Returns
        -------
        pandas.DataFrame
            Dataframe with base pairs data in the structure.

        """
        pass
        data = []
        for bp in self:
            i_res = bp.i_res
            j_res = bp.j_res

            # info from base pair
            row: tuple[str, int] = (
                i_res.parent.id,
                i_res.id[1],
                i_res.resname,
                j_res.resname,
                j_res.id[1],
                j_res.parent.id,
            )
            data.append(row)

        # cast into dataframe
        df = pd.DataFrame(
            data,
            columns=[
                "i_chain_id",
                "i_residue_index",
                "i_residue_name",
                "j_residue_name",
                "j_residue_index",
                "j_chain_id",
            ],
        )
        return df


    def __repr__(self):
        """Return string representation of the double-stranded nucleic acid."""
        i_strand = self.get_i_strand()
        i_start = i_strand[0].get_id()[1]
        i_end = i_strand[-1].get_id()[1]

        j_strand = self.get_j_strand()
        j_start = j_strand[0].get_id()[1]
        j_end = j_strand[-1].get_id()[1]

        return f"DoubleStrandNucleicAcid\n\
<i-th strand chain={i_strand.get_chain_id()} start={i_start} end={i_end}>\n\
<j-th strand chain={j_strand.get_chain_id()} start={j_start} end={j_end}>"




[docs]
class DSNABuilder:
    """Base class to extract double-stranded nucleic acids.

    This assumes you want both standard and non-standard amino acids.
    """

    def __init__(self, radius: float | int = 1.8):
        """
        Initialize the class.

        Parameters
        ----------
        radius : float | int
            Maximum allowed distance between P atom and O3' atom.

        """
        self.radius = radius


[docs]
    def build_double_strands(
        self,
        entity: Structure | Model | Chain,
        pairing_rules=dsDNAWatsonCrickBasePairRules,
    ) -> list[DoubleStrandNucleicAcid]:
        """
        Build and return a list of DoubleStrandNucleicAcid objects.

        Parameters
        ----------
        entity : L{Structure}, L{Model} or L{Chain}
            Double-stranded nucleic acids are searched for in this object.
        pairing_rules : optional
            Class instance with rules for proper pairing.
            PDBNucleicAcids.BasePairsRules.dsWatsonCrickBasePairsRules
            is the default. Currently it might work only with those rules.

        Returns
        -------
        all_dsna_list : list[DoubleStrandNucleicAcid]
            list if all DoubleStrandNucleicAcid found in the input entity.

        """
        # build nucleic acids
        builder = NABuilder()
        na_list = builder.build_nucleic_acids(
            entity, standard_nucleotides=False
        )

        already_paired = []
        all_dsna_list = []

        for na1 in na_list:
            for na2 in na_list:
                if na1 == na2:
                    continue

                dsna_list, already_paired = self._get_dsnas_from_nas(
                    na1, na2, pairing_rules, already_paired
                )

                all_dsna_list += dsna_list

        return all_dsna_list


    def _get_dsnas_from_nas(
        self,
        na1: NucleicAcid,
        na2: NucleicAcid,
        pairing_rules,
        already_paired: list[Residue],
    ):
        """
        Get DoubleStrandNucleicAcid's from NucleicAcid's.

        Return list of double-stranded nucleic acids but also list of paired
        residues.
        """
        nucleic_chain_ids = [na1.get_chain_id(), na2.get_chain_id()]

        # from 2 nucleic acids
        # 1-or-many segments of paired nucleic acids
        dsna_list = []

        prev_res = None

        dsna = DoubleStrandNucleicAcid()
        for res1 in na1:
            res2 = search_paired_base(
                res1,
                nucleic_chain_ids=nucleic_chain_ids,
                pairing_rules=pairing_rules,
            )

            if (
                res2 is None  # residue is not paired
                or res2 not in na2
                or res1 in already_paired
                or res2 in already_paired
                or (
                    prev_res is not None
                    and res2 is not None
                    and not NABuilder()._is_connected(prev_res, res2)
                    and not NABuilder()._is_connected(res2, prev_res)
                    # previous residue is paired and is not connected
                )
            ):
                # like prev res2 should be connected to next res2
                if len(dsna) > 0:
                    # at the end of continuity, at the start of discontinuity
                    dsna_list.append(dsna)
                    dsna = DoubleStrandNucleicAcid()
                continue

            dsna.append(BasePair(res1, res2, pairing_rules))
            already_paired += [res1, res2]

            prev_res = res2

        if dsna:
            # if at the end there is still a DoubleStrandNucleicAcid
            # with some base pairs
            dsna_list.append(dsna)

        return dsna_list, already_paired
Source code for PDBNucleicAcids.NucleicAcid

PDBNucleicAcids

Navigation

Related Topics