Source code for PDBNucleicAcids.utils
"""Utils."""
import pandas as pd
[docs]
def get_paired_segments(df: pd.DataFrame) -> list[int]:
"""
Get paired segments indexes.
A - T 0
T - A 0
A - T 0
C C -> -> [0, 0, 0, 1, 1, 1]
T A
C - G 1
G - C 1
T - A 1
Parameters
----------
df : pd.DataFrame
Dataframe with basepairs data.
Returns
-------
list[int]
An array of integers of the same length of the input dataframe.
Integers identify the uninterrupted segments of basepairs.
"""
# trivially true
# if there is only one pair base that there is only one paired segment
if len(df) == 1:
return [0]
paired_segment_ids: list[int] = []
# initialized data from previous bp and current bp as None
previous_i_chain_id: str | None = None
previous_i_residue_index: int | None = None
previous_j_chain_id: str | None = None
previous_j_residue_index: int | None = None
current_paired_segment = 0 # the first paired segment is index 0
current_i_chain_id: str | None = None
current_i_residue_index: int | None = None
current_j_chain_id: str | None = None
current_j_residue_index: int | None = None
for i, row in df.iterrows():
# update data for the current base pair (row : base pair)
current_i_chain_id: str = row["i_chain_id"]
current_i_residue_index: int = row["i_residue_index"]
current_j_chain_id: str = row["j_chain_id"]
current_j_residue_index: int = row["j_residue_index"]
if previous_i_chain_id is None:
# starting case, the very first base pair encountered
paired_segment_ids.append(current_paired_segment)
elif (
previous_i_chain_id == current_i_chain_id
and previous_j_chain_id == current_j_chain_id
and previous_i_residue_index + 1 == current_i_residue_index
and previous_j_residue_index - 1 == current_j_residue_index
):
# step case, all the other base pairs encountered
# continuity case
paired_segment_ids.append(current_paired_segment)
else:
# step case, all the other base pairs encountered
# discontinuity case
current_paired_segment += 1
paired_segment_ids.append(current_paired_segment)
# update data for the previous base pair (row : base pair)
previous_i_chain_id = current_i_chain_id
previous_i_residue_index = current_i_residue_index
previous_j_chain_id = current_j_chain_id
previous_j_residue_index = current_j_residue_index
# an array of segment indexes
return paired_segment_ids