Source code for PDBNucleicAcids.NucleicAcid

"""Nucleic Acids related classes and functions."""

import warnings

import pandas as pd
from Bio.Data.PDBData import nucleic_letters_3to1_extended
from Bio.Seq import Seq
from Bio.PDB.Polypeptide import is_nucleic
from Bio.PDB.PDBExceptions import PDBException
from Bio.PDB.NeighborSearch import NeighborSearch

from Bio.PDB.Structure import Structure
from Bio.PDB.Model import Model
from Bio.PDB.Chain import Chain
from Bio.PDB.Residue import Residue
from Bio.PDB.Atom import Atom

# absolute imports
from PDBNucleicAcids.BasePairRules import WatsonCrickBasePairRules


[docs] def search_paired_base( residue: Residue, pairing_rules=WatsonCrickBasePairRules(), nucleic_chain_ids: list[str] | None = None, nucleic_atoms: list[Atom] | None = None, ) -> Residue | None: """ Search in the vicinity of a given base for its paired base. Parameters ---------- residue : Bio.PDB.Residue.Residue A Biopython nucleic acid residue (nucleotide) taken from a Biopython structure. pairing_rules : optional Class instance with rules for proper pairing. `PDBNucleicAcids.BasePairsRules.WatsonCrickBasePairRules` is the default. nucleic_chain_ids : list[str], optional List of ids for nucleic acid chains. If None they will be inferred using `NABuilder` class. Default is None. nucleic_atoms : list[Atom], optional List of atoms from the nucleic acid chains. If None they will be inferred using `nucleic_chain_ids` parameter. Default is None. Returns ------- Bio.PDB.Residue.Residue | None Nucleotide that binds to the input nucleotide residue. None in the case there is no nucleotide paired to the input nucleotide. """ # from residue get to the structure structure: Structure = residue.parent.parent.parent # in case no nucleic atoms and no nucleic chain ids were given # get nucleic chains using get_polymer_dataframe if not nucleic_atoms and not nucleic_chain_ids: builder = NABuilder() nucleic_acids = builder.build_nucleic_acids( structure, standard_nucleotides=False ) nucleic_atoms = [] for na in nucleic_acids: nucleic_atoms += na.get_atoms() # in case no nucleic atoms were given but # nucleic chain ids were given or got from get_polymer_dataframe # list of all nucleic atoms, compute only once if not nucleic_atoms and nucleic_chain_ids: nucleic_atoms: list[Atom] = [ atom for atom in structure.get_atoms() if atom.parent.parent.id in nucleic_chain_ids and atom.parent != residue # not from the same residue ] # if there are no nucleic atoms except from the one input residue # then NeighborSearch will return an error if len(nucleic_atoms) == 0 or nucleic_atoms is None: return None # initialize NeighborSearch class ns = NeighborSearch(nucleic_atoms) central_atom: Atom = pairing_rules.atom_from_central_hbond(residue) # if the input residue is not a DNA base then the central atom # will be None if central_atom is None: return None # search around the central atom of the residue atom_neighborhood = ns.search(center=central_atom.coord, radius=4.0) residue_neighborhood: list[Residue] = list( {atom.parent for atom in atom_neighborhood} ) candidate_residues: list[Residue] = [] for residue_neighbour in residue_neighborhood: if pairing_rules.is_candidate(base1=residue, base2=residue_neighbour): dist = pairing_rules.distance( base1=residue, base2=residue_neighbour ) candidate_residues.append((residue_neighbour, dist)) if len(candidate_residues) > 1: # If there is more than one paired residue # TODO maybe add a scoring function instead of simple distance # TODO add a warning if there is more than one candidate # or maybe more than one candidate with similar dist or score min_tuple: tuple = min(candidate_residues, key=lambda x: x[1]) return min_tuple[0] elif candidate_residues: # only one paired nucleotide found return candidate_residues[0][0] else: # no paired nucleotides found return None
[docs] class NucleicAcid(list): """A nucleic acid is simply a list of nucleic L{Residue} objects."""
[docs] def get_seq(self) -> Seq: """ Return the nucleotide sequence as a Seq object. Returns ------- Bio.Seq.Seq Sequence of nucleotides in a continous strand. """ resname_list = [res.get_resname() for res in self] # add padding since nucleic_letters_3to1_extended requires three char # i.e. from "DT" to "DT " resname_list = [resname.ljust(3, " ") for resname in resname_list] seq = "".join( nucleic_letters_3to1_extended.get(resname, "X") for resname in resname_list ) return Seq(seq)
[docs] def get_atoms(self): """Return atoms in the nucleic acid.""" atom_list = [] for res in self: atom_list += res.get_atoms() return atom_list
[docs] def get_chain_id(self): """Return chain id of the nucleic acid.""" return self[0].parent.id
[docs] def get_na_type(self) -> str: """Return type of nucleic acid: DNA or RNA.""" dna_counter = 0 rna_counter = 0 other_counter = 0 other_list = [] for res in self: if ( is_nucleic(res, standard=False) and "O3'" in res and "O2'" not in res # deoxy-ribose ): dna_counter += 1 elif ( is_nucleic(res, standard=False) and "O3'" in res and "O2'" in res ): rna_counter += 1 else: other_counter += 1 other_list.append(res.get_resname()) if dna_counter == len(self): return "DNA" elif rna_counter == len(self): return "RNA" elif dna_counter + rna_counter == len(self): return "DNA-RNA hybrid" else: warnings.warn(f"Found these unknown residues: {other_list}") return "Unknown"
def __repr__(self): """Return string representation of the nucleic acid.""" start = self[0].get_id()[1] end = self[-1].get_id()[1] return f"<NucleicAcid chain='{self.get_chain_id()}' \ type='{self.get_na_type()}' start={start} end={end}>"
class _NABuilder: """Base class to extract nucleic acids. It checks if two consecutive residues in a chain are connected. The connectivity test is implemented by a subclass. This assumes you want both standard and non-standard nucleotides. """ def __init__(self, radius: float | int): """ Initialize the base class. Parameters ---------- radius : float | int Maximum allowed distance between P atom and O3' atom. """ self.radius = radius def _accept(self, residue: Residue, standard_nucleotides: bool) -> bool: """Check if the residue is a nucleotide (PRIVATE).""" if is_nucleic(residue, standard=standard_nucleotides): return True elif not standard_nucleotides and "O3'" in residue.child_dict: # It has an alpha carbon... # We probably need to update the hard coded list of # non-standard residues, see function is_aa for details. warnings.warn( f"Assuming residue {residue.resname} is an unknown modified \ nucleotide." ) return True else: # not a standard nucleotide so skip return False def build_nucleic_acids( self, entity: Structure | Model | Chain, standard_nucleotides: bool = False, ) -> list[NucleicAcid]: """ Build and return a list of NucleicAcid objects. Parameters ---------- entity : L{Structure}, L{Model} or L{Chain} Double-stranded nucleic acids are searched for in this object. standard_nucleotides : bool, optional Looking for standard nucleotides. The default is True. Raises ------ PDBException In case input entity is not a L{Structure}, L{Model} or L{Chain}. Returns ------- na_list : list[NucleicAcid] List if all NucleicAcid found in the input entity. """ level = entity.get_level() # Decide which entity we are dealing with if level == "S": model = entity[0] chain_list = model.get_list() elif level == "M": chain_list = entity.get_list() elif level == "C": chain_list = [entity] else: raise PDBException("Entity should be Structure, Model or Chain.") # initialize list of nucleic acids (polymers) na_list = [] for chain in chain_list: # list of residues filtered by accepted nucleotides res_list = [ res for res in chain if self._accept(res, standard_nucleotides=standard_nucleotides) ] na = None for prev_res in res_list: # look for a 5' end of the nucleic acid if not any( [self._is_connected(res, prev_res) for res in res_list] ): # residue is 5' end because it has no residue connected to # its 5' oxygen # we build the nucleic acid in 5'->3' direction # initialize nucleic acid class with already the 5' end na = NucleicAcid() na.append(prev_res) na_list.append(na) # loop over all residues i = 0 while i < len(res_list): next_res = res_list[i] i += 1 # check if previous and next residues are connected if self._is_connected(prev_res, next_res): na.append(next_res) # update previous residue prev_res = next_res # restarts the while loop i = 0 # while loop ends only when no connected next nucleotide # is found # it goes back to the for loop # which looks for another 5' end return na_list
[docs] class NABuilder(_NABuilder): """Use P--O3' distance to find connected nucletides.""" def __init__(self, radius: float | int = 1.9): """ Initialize the class. Parameters ---------- radius : float | int Maximum allowed distance between P atom and O3' atom. """ _NABuilder.__init__(self, radius) def _is_connected(self, prev_res: Residue, next_res: Residue): if not prev_res.has_id("O3'"): return False if not next_res.has_id("P"): return False o = prev_res["O3'"] p = next_res["P"] # get all disordered atom positions if o.is_disordered(): o_list = o.disordered_get_list() else: o_list = [o] if p.is_disordered(): p_list = p.disordered_get_list() else: p_list = [p] # Test all disordered atom positions if any # if not then just test the atom pair for pp in p_list: for oo in o_list: # To form a peptide bond, N and C must be # within radius and have the same altloc # identifier or one altloc blank p_altloc = pp.get_altloc() o_altloc = oo.get_altloc() if ( p_altloc == o_altloc or p_altloc == " " or o_altloc == " " ) and self._test_dist(pp, oo): # Select the disordered atoms that # are indeed bonded if o.is_disordered(): o.disordered_select(o_altloc) if p.is_disordered(): p.disordered_select(p_altloc) return True return False def _test_dist(self, o: Atom, p: Atom) -> bool: """Return True if distance between atoms<radius (PRIVATE).""" if (o - p) < self.radius: return True else: return False
[docs] class BasePair: """Pair of nucleotides.""" def __init__(self, i_res: Residue, j_res: Residue, pairing_rules): self.i_res = i_res self.j_res = j_res self.pairing_rules = pairing_rules
[docs] def get_i_res(self): """Return i-th nucleotide.""" return self.i_res
[docs] def get_j_res(self): """Return j-th nucleotide.""" return self.j_res
[docs] def get_atoms(self): """Return atoms in the nucleic acid.""" atom_list = [] atom_list += self.i_res.get_atoms() atom_list += self.j_res.get_atoms() return atom_list
[docs] def check_validity(self, pairing_rules=None): """Check for validity for base pair, using input rules.""" if pairing_rules is None: pairing_rules = self.pairing_rules return pairing_rules.is_candidate(self.i_res, self.j_res)
# TODO get other information: shear, stretch, buckle, propeller, opening
[docs] def get_stagger(self, pairing_rules=None): """Return stagger value from base pair.""" if pairing_rules is None: pairing_rules = self.pairing_rules return pairing_rules.stagger(self.i_res, self.j_res)
def __repr__(self): """Return string representation of base pair.""" i_resname = self.i_res.get_resname() j_resname = self.j_res.get_resname() return f"<BasePair i_res={i_resname} j_res={j_resname}>"
[docs] class DoubleStrandNucleicAcid(list): """List of BasePairs."""
[docs] def get_atoms(self) -> list[Atom]: """Return atoms in the double stranded nucleic acid.""" atom_list = [] for bp in self: atom_list += bp.get_atoms() return atom_list
[docs] def get_i_strand(self) -> NucleicAcid: """Get i-th strand as NucleicAcid object.""" na = NucleicAcid() for bp in self: na.append(bp.i_res) return na
[docs] def get_j_strand(self) -> NucleicAcid: """Get j-th strand as NucleicAcid objects.""" na = NucleicAcid() for bp in self: na.append(bp.j_res) return na
[docs] def get_na_complex_type(self) -> str: """ Return nucleic acid complex type. i.e. DNA/DNA, DNA/RNA, etc """ i_type = self.get_i_strand().get_na_type() j_type = self.get_j_strand().get_na_type() if i_type == "DNA" and j_type == "DNA": return "dsDNA" elif i_type == "RNA" and j_type == "RNA": return "dsRNA" else: return f"{i_type}/{j_type}"
# TODO get other information: shear, stretch, buckle, propeller, opening
[docs] def get_stagger_values(self): """Get stagger value for every base pair.""" return [bp.get_stagger() for bp in self]
[docs] def get_dataframe(self) -> pd.DataFrame: """ Return dataframe with base pairs data. Returns ------- pandas.DataFrame Dataframe with base pairs data in the structure. """ pass data = [] for bp in self: i_res = bp.i_res j_res = bp.j_res # info from base pair row: tuple[str, int] = ( i_res.parent.id, i_res.id[1], i_res.resname, j_res.resname, j_res.id[1], j_res.parent.id, ) data.append(row) # cast into dataframe df = pd.DataFrame( data, columns=[ "i_chain_id", "i_residue_index", "i_residue_name", "j_residue_name", "j_residue_index", "j_chain_id", ], ) return df
def __repr__(self): """Return string representation of the double-stranded nucleic acid.""" i_strand = self.get_i_strand() # i_start = i_strand[0].get_id()[1] # i_end = i_strand[-1].get_id()[1] j_strand = self.get_j_strand() # j_start = j_strand[0].get_id()[1] # j_end = j_strand[-1].get_id()[1] return f"<DoubleStrandNucleicAcid \ type='{self.get_na_complex_type()}' \ i-th strand='{i_strand.get_chain_id()}' \ j-th strand='{j_strand.get_chain_id()}' length={len(self)}>"
[docs] class DSNABuilder: """Base class to extract double-stranded nucleic acids. This assumes you want both standard and non-standard amino acids. """ def __init__(self, radius: float | int = 1.9): """ Initialize the class. Parameters ---------- radius : float | int Maximum allowed distance between P atom and O3' atom. """ self.radius = radius
[docs] def build_double_strands( self, entity: Structure | Model | Chain, pairing_rules=WatsonCrickBasePairRules(), ) -> list[DoubleStrandNucleicAcid]: """ Build and return a list of DoubleStrandNucleicAcid objects. Parameters ---------- entity : L{Structure}, L{Model} or L{Chain} Double-stranded nucleic acids are searched for in this object. pairing_rules : optional Class instance with rules for proper pairing. `PDBNucleicAcids.BasePairsRules.WatsonCrickBasePairRules` is the default. Returns ------- all_dsna_list : list[DoubleStrandNucleicAcid] list if all DoubleStrandNucleicAcid found in the input entity. """ # build nucleic acids builder = NABuilder() na_list = builder.build_nucleic_acids( entity, standard_nucleotides=False ) already_paired = [] all_dsna_list = [] for na1 in na_list: for na2 in na_list: if na1 == na2: continue dsna_list, already_paired = self._get_dsnas_from_nas( na1, na2, pairing_rules, already_paired ) all_dsna_list += dsna_list return all_dsna_list
def _get_dsnas_from_nas( self, na1: NucleicAcid, na2: NucleicAcid, pairing_rules, already_paired: list[Residue], ): """ Get DoubleStrandNucleicAcid's from NucleicAcid's. Return list of double-stranded nucleic acids but also list of paired residues. """ nucleic_chain_ids = [na1.get_chain_id(), na2.get_chain_id()] # from 2 nucleic acids # 1-or-many segments of paired nucleic acids dsna_list = [] prev_res = None dsna = DoubleStrandNucleicAcid() for res1 in na1: res2 = search_paired_base( res1, nucleic_chain_ids=nucleic_chain_ids, pairing_rules=pairing_rules, ) if ( res2 is None # residue is not paired or res2 not in na2 or res1 in already_paired or res2 in already_paired or ( prev_res is not None and res2 is not None and not NABuilder()._is_connected(prev_res, res2) and not NABuilder()._is_connected(res2, prev_res) # previous residue is paired and is not connected ) ): # like prev res2 should be connected to next res2 if len(dsna) > 0: # at the end of continuity, at the start of discontinuity dsna_list.append(dsna) dsna = DoubleStrandNucleicAcid() continue dsna.append(BasePair(res1, res2, pairing_rules)) already_paired += [res1, res2] prev_res = res2 if dsna: # if at the end there is still a DoubleStrandNucleicAcid # with some base pairs dsna_list.append(dsna) return dsna_list, already_paired