"""Nucleic Acids related classes and functions."""
import warnings
import pandas as pd
from Bio.Data.PDBData import nucleic_letters_3to1_extended
from Bio.Seq import Seq
from Bio.PDB.Polypeptide import is_nucleic
from Bio.PDB.PDBExceptions import PDBException
from Bio.PDB.NeighborSearch import NeighborSearch
from Bio.PDB.Structure import Structure
from Bio.PDB.Model import Model
from Bio.PDB.Chain import Chain
from Bio.PDB.Residue import Residue
from Bio.PDB.Atom import Atom
# absolute imports
from PDBNucleicAcids.BasePairRules import WatsonCrickBasePairRules
[docs]
def search_paired_base(
residue: Residue,
pairing_rules=WatsonCrickBasePairRules(),
nucleic_chain_ids: list[str] | None = None,
nucleic_atoms: list[Atom] | None = None,
) -> Residue | None:
"""
Search in the vicinity of a given base for its paired base.
Parameters
----------
residue : Bio.PDB.Residue.Residue
A Biopython nucleic acid residue (nucleotide) taken from a Biopython
structure.
pairing_rules : optional
Class instance with rules for proper pairing.
`PDBNucleicAcids.BasePairsRules.WatsonCrickBasePairRules`
is the default.
nucleic_chain_ids : list[str], optional
List of ids for nucleic acid chains. If None they will be
inferred using `NABuilder` class. Default is None.
nucleic_atoms : list[Atom], optional
List of atoms from the nucleic acid chains. If None they will be
inferred using `nucleic_chain_ids` parameter. Default is None.
Returns
-------
Bio.PDB.Residue.Residue | None
Nucleotide that binds to the input nucleotide residue.
None in the case there is no nucleotide paired to the input nucleotide.
"""
# from residue get to the structure
structure: Structure = residue.parent.parent.parent
# in case no nucleic atoms and no nucleic chain ids were given
# get nucleic chains using get_polymer_dataframe
if not nucleic_atoms and not nucleic_chain_ids:
builder = NABuilder()
nucleic_acids = builder.build_nucleic_acids(
structure, standard_nucleotides=False
)
nucleic_atoms = []
for na in nucleic_acids:
nucleic_atoms += na.get_atoms()
# in case no nucleic atoms were given but
# nucleic chain ids were given or got from get_polymer_dataframe
# list of all nucleic atoms, compute only once
if not nucleic_atoms and nucleic_chain_ids:
nucleic_atoms: list[Atom] = [
atom
for atom in structure.get_atoms()
if atom.parent.parent.id in nucleic_chain_ids
and atom.parent != residue # not from the same residue
]
# if there are no nucleic atoms except from the one input residue
# then NeighborSearch will return an error
if len(nucleic_atoms) == 0 or nucleic_atoms is None:
return None
# initialize NeighborSearch class
ns = NeighborSearch(nucleic_atoms)
central_atom: Atom = pairing_rules.atom_from_central_hbond(residue)
# if the input residue is not a DNA base then the central atom
# will be None
if central_atom is None:
return None
# search around the central atom of the residue
# TODO radius as parameter
atom_neighborhood = ns.search(center=central_atom.coord, radius=4.0)
residue_neighborhood: list[Residue] = list(
{atom.parent for atom in atom_neighborhood}
)
candidate_residues: list[Residue] = []
for residue_neighbour in residue_neighborhood:
if pairing_rules.is_candidate(base1=residue, base2=residue_neighbour):
dist = pairing_rules.distance(
base1=residue, base2=residue_neighbour
)
candidate_residues.append((residue_neighbour, dist))
if len(candidate_residues) > 1:
# If there is more than one paired residue
# TODO maybe add a scoring function instead of simple distance
# TODO add a warning if there is more than one candidate
# or maybe more than one candidate with similar dist or score
min_tuple: tuple = min(candidate_residues, key=lambda x: x[1])
return min_tuple[0]
elif candidate_residues:
# only one paired nucleotide found
return candidate_residues[0][0]
else:
# no paired nucleotides found
return None
[docs]
class NucleicAcid(list):
"""A nucleic acid is simply a list of nucleic L{Residue} objects."""
[docs]
def get_seq(self) -> Seq:
"""
Return the nucleotide sequence as a Seq object.
Returns
-------
Bio.Seq.Seq
Sequence of nucleotides in a continous strand.
"""
resname_list = [res.get_resname() for res in self]
# add padding since nucleic_letters_3to1_extended requires three char
# i.e. from "DT" to "DT "
resname_list = [resname.ljust(3, " ") for resname in resname_list]
seq = "".join(
nucleic_letters_3to1_extended.get(resname, "X")
for resname in resname_list
)
return Seq(seq)
[docs]
def get_atoms(self):
"""Return atoms in the nucleic acid."""
atom_list = []
for res in self:
atom_list += res.get_atoms()
return atom_list
[docs]
def get_chain_id(self):
"""Return chain id of the nucleic acid."""
return self[0].parent.id
[docs]
def get_na_type(self) -> str:
"""Return type of nucleic acid: DNA or RNA."""
dna_counter = 0
rna_counter = 0
other_counter = 0
other_list = []
for res in self:
if (
is_nucleic(res, standard=False)
and "O3'" in res
and "O2'" not in res # deoxy-ribose
):
dna_counter += 1
elif (
is_nucleic(res, standard=False)
and "O3'" in res
and "O2'" in res
):
rna_counter += 1
else:
other_counter += 1
other_list.append(res.get_resname())
if dna_counter == len(self):
return "DNA"
elif rna_counter == len(self):
return "RNA"
elif dna_counter + rna_counter == len(self):
return "DNA-RNA hybrid"
else:
warnings.warn(f"Found these unknown residues: {other_list}")
return "Unknown"
def __repr__(self):
"""Return string representation of the nucleic acid."""
start = self[0].get_id()[1]
end = self[-1].get_id()[1]
return f"<NucleicAcid chain='{self.get_chain_id()}' \
type='{self.get_na_type()}' start={start} end={end}>"
class _NABuilder:
"""Base class to extract nucleic acids.
It checks if two consecutive residues in a chain are connected.
The connectivity test is implemented by a subclass.
This assumes you want both standard and non-standard nucleotides.
"""
def __init__(self, radius: float | int):
"""
Initialize the base class.
Parameters
----------
radius : float | int
Maximum allowed distance between P atom and O3' atom.
"""
self.radius = radius
def _accept(self, residue: Residue, standard_nucleotides: bool) -> bool:
"""Check if the residue is a nucleotide (PRIVATE)."""
if is_nucleic(residue, standard=standard_nucleotides):
return True
elif not standard_nucleotides and "O3'" in residue.child_dict:
# It has an alpha carbon...
# We probably need to update the hard coded list of
# non-standard residues, see function is_aa for details.
warnings.warn(
f"Assuming residue {residue.resname} is an unknown modified \
nucleotide."
)
return True
else:
# not a standard nucleotide so skip
return False
def build_nucleic_acids(
self,
entity: Structure | Model | Chain,
standard_nucleotides: bool = False,
) -> list[NucleicAcid]:
"""
Build and return a list of NucleicAcid objects.
Parameters
----------
entity : L{Structure}, L{Model} or L{Chain}
Double-stranded nucleic acids are searched for in this object.
standard_nucleotides : bool, optional
Looking for standard nucleotides. The default is True.
Raises
------
PDBException
In case input entity is not a L{Structure}, L{Model} or L{Chain}.
Returns
-------
na_list : list[NucleicAcid]
List if all NucleicAcid found in the input entity.
"""
level = entity.get_level()
# Decide which entity we are dealing with
if level == "S":
model = entity[0]
chain_list = model.get_list()
elif level == "M":
chain_list = entity.get_list()
elif level == "C":
chain_list = [entity]
else:
raise PDBException("Entity should be Structure, Model or Chain.")
# initialize list of nucleic acids (polymers)
na_list = []
for chain in chain_list:
# list of residues filtered by accepted nucleotides
res_list = [
res
for res in chain
if self._accept(res, standard_nucleotides=standard_nucleotides)
]
na = None
for prev_res in res_list:
# look for a 5' end of the nucleic acid
if not any(
[self._is_connected(res, prev_res) for res in res_list]
):
# residue is 5' end because it has no residue connected to
# its 5' oxygen
# we build the nucleic acid in 5'->3' direction
# initialize nucleic acid class with already the 5' end
na = NucleicAcid()
na.append(prev_res)
na_list.append(na)
# loop over all residues
i = 0
while i < len(res_list):
next_res = res_list[i]
i += 1
# check if previous and next residues are connected
if self._is_connected(prev_res, next_res):
na.append(next_res)
# update previous residue
prev_res = next_res
# restarts the while loop
i = 0
# while loop ends only when no connected next nucleotide
# is found
# it goes back to the for loop
# which looks for another 5' end
return na_list
[docs]
class NABuilder(_NABuilder):
"""Use P--O3' distance to find connected nucletides."""
def __init__(self, radius: float | int = 1.9):
"""
Initialize the class.
Parameters
----------
radius : float | int
Maximum allowed distance between P atom and O3' atom.
"""
_NABuilder.__init__(self, radius)
def _is_connected(self, prev_res: Residue, next_res: Residue):
if not prev_res.has_id("O3'"):
return False
if not next_res.has_id("P"):
return False
o = prev_res["O3'"]
p = next_res["P"]
# get all disordered atom positions
if o.is_disordered():
o_list = o.disordered_get_list()
else:
o_list = [o]
if p.is_disordered():
p_list = p.disordered_get_list()
else:
p_list = [p]
# Test all disordered atom positions if any
# if not then just test the atom pair
for pp in p_list:
for oo in o_list:
# To form a peptide bond, N and C must be
# within radius and have the same altloc
# identifier or one altloc blank
p_altloc = pp.get_altloc()
o_altloc = oo.get_altloc()
if (
p_altloc == o_altloc or p_altloc == " " or o_altloc == " "
) and self._test_dist(pp, oo):
# Select the disordered atoms that
# are indeed bonded
if o.is_disordered():
o.disordered_select(o_altloc)
if p.is_disordered():
p.disordered_select(p_altloc)
return True
return False
def _test_dist(self, o: Atom, p: Atom) -> bool:
"""Return True if distance between atoms<radius (PRIVATE)."""
if (o - p) < self.radius:
return True
else:
return False
[docs]
class BasePair:
"""Pair of nucleotides."""
def __init__(self, i_res: Residue, j_res: Residue, pairing_rules):
self.i_res = i_res
self.j_res = j_res
self.pairing_rules = pairing_rules
[docs]
def get_i_res(self):
"""Return i-th nucleotide."""
return self.i_res
[docs]
def get_j_res(self):
"""Return j-th nucleotide."""
return self.j_res
[docs]
def get_atoms(self):
"""Return atoms in the nucleic acid."""
atom_list = []
atom_list += self.i_res.get_atoms()
atom_list += self.j_res.get_atoms()
return atom_list
[docs]
def check_validity(self, pairing_rules=None):
"""Check for validity for base pair, using input rules."""
if pairing_rules is None:
pairing_rules = self.pairing_rules
return pairing_rules.is_candidate(self.i_res, self.j_res)
# TODO get other information: shear, stretch, buckle, propeller, opening
[docs]
def get_stagger(self, pairing_rules=None):
"""Return stagger value from base pair."""
if pairing_rules is None:
pairing_rules = self.pairing_rules
return pairing_rules.stagger(self.i_res, self.j_res)
def __repr__(self):
"""Return string representation of base pair."""
i_resname = self.i_res.get_resname()
j_resname = self.j_res.get_resname()
return f"<BasePair i_res={i_resname} j_res={j_resname}>"
[docs]
class DoubleStrandNucleicAcid(list):
"""List of BasePairs."""
[docs]
def get_atoms(self) -> list[Atom]:
"""Return atoms in the double stranded nucleic acid."""
atom_list = []
for bp in self:
atom_list += bp.get_atoms()
return atom_list
[docs]
def get_i_strand(self) -> NucleicAcid:
"""Get i-th strand as NucleicAcid object."""
na = NucleicAcid()
for bp in self:
na.append(bp.i_res)
return na
[docs]
def get_j_strand(self) -> NucleicAcid:
"""Get j-th strand as NucleicAcid objects."""
na = NucleicAcid()
for bp in self:
na.append(bp.j_res)
return na
[docs]
def get_na_complex_type(self) -> str:
"""
Return nucleic acid complex type.
i.e. DNA/DNA, DNA/RNA, etc
"""
i_type = self.get_i_strand().get_na_type()
j_type = self.get_j_strand().get_na_type()
if i_type == "DNA" and j_type == "DNA":
return "dsDNA"
elif i_type == "RNA" and j_type == "RNA":
return "dsRNA"
else:
return f"{i_type}/{j_type}"
# TODO get other information: shear, stretch, buckle, propeller, opening
[docs]
def get_stagger_values(self):
"""Get stagger value for every base pair."""
return [bp.get_stagger() for bp in self]
[docs]
def get_dataframe(self) -> pd.DataFrame:
"""
Return dataframe with base pairs data.
Returns
-------
pandas.DataFrame
Dataframe with base pairs data in the structure.
"""
pass
data = []
for bp in self:
i_res = bp.i_res
j_res = bp.j_res
# info from base pair
row: tuple[str, int] = (
i_res.parent.id,
i_res.id[1],
i_res.resname,
j_res.resname,
j_res.id[1],
j_res.parent.id,
)
data.append(row)
# cast into dataframe
df = pd.DataFrame(
data,
columns=[
"i_chain_id",
"i_residue_index",
"i_residue_name",
"j_residue_name",
"j_residue_index",
"j_chain_id",
],
)
return df
def __repr__(self):
"""Return string representation of the double-stranded nucleic acid."""
i_strand_id = self.get_i_strand().get_chain_id()
j_strand_id = self.get_j_strand().get_chain_id()
return f"<DoubleStrandNucleicAcid \
type='{self.get_na_complex_type()}' \
strand ids='{i_strand_id}:{j_strand_id}' \
length={len(self)}>"
[docs]
class DSNABuilder:
"""Base class to extract double-stranded nucleic acids.
This assumes you want both standard and non-standard amino acids.
"""
def __init__(self, radius: float | int = 1.9):
"""
Initialize the class.
Parameters
----------
radius : float | int
Maximum allowed distance between P atom and O3' atom.
"""
self.radius = radius
[docs]
def build_double_strands(
self,
entity: Structure | Model | Chain,
pairing_rules=WatsonCrickBasePairRules(),
) -> list[DoubleStrandNucleicAcid]:
"""
Build and return a list of DoubleStrandNucleicAcid objects.
Parameters
----------
entity : L{Structure}, L{Model} or L{Chain}
Double-stranded nucleic acids are searched for in this object.
pairing_rules : optional
Class instance with rules for proper pairing.
`PDBNucleicAcids.BasePairsRules.WatsonCrickBasePairRules`
is the default.
Returns
-------
all_dsna_list : list[DoubleStrandNucleicAcid]
list if all DoubleStrandNucleicAcid found in the input entity.
"""
# build nucleic acids
builder = NABuilder()
na_list = builder.build_nucleic_acids(
entity, standard_nucleotides=False
)
already_paired = []
all_dsna_list = []
for na1 in na_list:
for na2 in na_list:
dsna_list, already_paired = self._get_dsnas_from_nas(
na1, na2, pairing_rules, already_paired
)
all_dsna_list += dsna_list
return all_dsna_list
def _get_dsnas_from_nas(
self,
na1: NucleicAcid,
na2: NucleicAcid,
pairing_rules,
already_paired: list[Residue],
):
"""
Get DoubleStrandNucleicAcid's from NucleicAcid's.
Return list of double-stranded nucleic acids but also list of paired
residues.
"""
nucleic_chain_ids = [na1.get_chain_id(), na2.get_chain_id()]
# from 2 nucleic acids
# 1-or-many segments of paired nucleic acids
dsna_list = []
dsna = DoubleStrandNucleicAcid()
for res1 in na1:
res2 = search_paired_base(
res1,
nucleic_chain_ids=nucleic_chain_ids,
pairing_rules=pairing_rules,
)
# This new algorithm to build DSNA addresses structures like 1AWC
# with many interruptions
# and nicked DNA like 1CW0 too probably
# discontinuity by unpaired or already paired bases
if (
res2 is None
or res1 in already_paired # is already paired
or res2 in already_paired # is already paired
or res2 not in na2 # not in the target nucleic acid
or NABuilder()._is_connected(res1, res2) # not connected
or NABuilder()._is_connected(res2, res1) # not connected
):
if len(dsna) > 0:
# start of discontinuity, save DSNA and start a new one
dsna_list.append(dsna)
dsna = DoubleStrandNucleicAcid()
# there is a valid base pair
else:
# base pair is not connected to the rest of the DSNA ...
if len(dsna) > 0 and not (
(
# at least one of the two residues is NOT connected
# in the "normal" order
NABuilder()._is_connected(dsna[-1].i_res, res1)
and NABuilder()._is_connected(res2, dsna[-1].j_res)
)
or (
# at least one of the two residues is NOT connected
# in the "reverse" order
NABuilder()._is_connected(dsna[-1].j_res, res1)
and NABuilder()._is_connected(res2, dsna[-1].i_res)
)
):
# save DSNA and start a new one
dsna_list.append(dsna)
dsna = DoubleStrandNucleicAcid()
# save base pair
dsna.append(BasePair(res1, res2, pairing_rules))
already_paired += [res1, res2]
# base pair is connected to the rest of DSNA
elif len(dsna) > 0 and (
# at least one of the two residues IS connected
# in the "normal" order
NABuilder()._is_connected(dsna[-1].i_res, res1)
and NABuilder()._is_connected(res2, dsna[-1].j_res)
):
# save base pair
dsna.append(BasePair(res1, res2, pairing_rules))
already_paired += [res1, res2]
# base pair is connected to the rest of DSNA
elif len(dsna) > 0 and (
# at least one of the two residues IS connected
# in the "reverse" order
NABuilder()._is_connected(dsna[-1].j_res, res1)
and NABuilder()._is_connected(res2, dsna[-1].i_res)
):
# save base pair
dsna.append(BasePair(res2, res1, pairing_rules))
already_paired += [res1, res2]
# base pair is the start of a DSNA
elif len(dsna) == 0:
# save base pair
dsna.append(BasePair(res1, res2, pairing_rules))
already_paired += [res1, res2]
if len(dsna) > 0:
# if at the end there is still a DoubleStrandNucleicAcid
# with some base pairs
dsna_list.append(dsna)
return dsna_list, already_paired