Module proteinflow.metrics
Metrics for evaluating prediction quality.
Expand source code
"""Metrics for evaluating prediction quality."""
import os
import biotite.structure.io as bsio
import numpy as np
import torch
from torch.nn import functional as F
from tqdm import tqdm
from proteinflow.extra import requires_extra
try:
import blosum as bl
except ImportError:
pass
try:
import esm
except ImportError:
pass
try:
from tmtools import tm_align
except ImportError:
pass
try:
import ablang
except ImportError:
pass
try:
from igfold import IgFoldRunner
except ImportError:
pass
try:
from ImmuneBuilder import ABodyBuilder2, NanoBodyBuilder2, TCRBuilder2
except ImportError:
pass
@requires_extra("blosum")
def blosum62_score(seq_before, seq_after):
"""Calculate the BLOSUM62 score between two sequences.
Parameters
----------
seq_before : str
The sequence before the mutation
seq_after : str
The sequence after the mutation
Returns
-------
score : int
The BLOSUM62 score between the two sequences
"""
assert len(seq_before) == len(seq_after)
matrix = bl.BLOSUM(62)
score = 0
for x_before, x_after in zip(seq_before, seq_after):
score += matrix[x_before][x_after]
return score
@requires_extra("blosum")
def long_repeat_num(seq, thr=5):
"""Calculate the number of long repeats in a sequence.
Parameters
----------
seq : str
The sequence to be evaluated.
thr : int, default 5
The threshold of the length of a repeat.
Returns
-------
num : int
The number of long repeats in the sequence.
"""
arr = np.array(list(seq))
# Find the indices where the array changes its value
changes = np.flatnonzero(arr[:-1] != arr[1:])
# Split the array into consecutive groups
groups = np.split(arr, changes + 1)
# Filter groups that are longer than N
long_groups = filter(lambda g: len(g) > thr, groups)
# Count the number of long groups
count = sum(1 for _ in long_groups)
return count
@requires_extra("esm", install_name="fair-esm")
def _get_esm_model(esm_model_name):
"""Get ESM model, batch converter and tok_to_idx dictionary."""
model_dict = {
"esm2_t6_8M_UR50D": esm.pretrained.esm2_t6_8M_UR50D,
"esm2_t12_35M_UR50D": esm.pretrained.esm2_t12_35M_UR50D,
"esm2_t30_150M_UR50D": esm.pretrained.esm2_t30_150M_UR50D,
"esm2_t33_650M_UR50D": esm.pretrained.esm2_t33_650M_UR50D,
"esm2_t36_3B_UR50D": esm.pretrained.esm2_t36_3B_UR50D,
"esm2_t48_15B_UR50D": esm.pretrained.esm2_t48_15B_UR50D,
}
esm_model, alphabet = model_dict[esm_model_name]()
if torch.cuda.is_available():
esm_model.to("cuda")
batch_converter = alphabet.get_batch_converter()
tok_to_idx = alphabet.tok_to_idx
return esm_model, batch_converter, tok_to_idx
@requires_extra("ablang")
def ablang_pll(
sequence,
predict_mask,
ablang_model_name="heavy",
average=False,
):
"""Compute pseudo log likelihood.
Note that you need to install `ablang` (see https://github.com/oxpig/AbLang/tree/main).
Parameters
----------
sequence : str
Chain sequence (string of amino acid codes)
predict_mask : np.ndarray
Predict mask corresponding to the sequence (array of 0 and 1 where 1 indicates a predicted residue)
ablang_model_name : {"heavy", "light"}, default "heavy"
Name of the AbLang model to use
average : bool, default False
Whether to average the pseudo log likelihood over the residues
Returns
-------
pll: float
Pseudo log likelihood
"""
ablang_model = ablang.pretrained(
ablang_model_name
) # Use "light" if you are working with light chains
ablang_model.freeze()
sequences = []
sequence = list(sequence)
predict_idx = np.where(predict_mask)[0]
for i in predict_idx:
sequences.append("".join(sequence[:i]) + "*" + "".join(sequence[i + 1 :]))
logits = ablang_model(sequences, mode="likelihood")[:, 1:]
exp_logits = np.exp(logits)
prob = exp_logits / exp_logits.sum(axis=-1, keepdims=True)
true_idx = [
ablang_model.tokenizer.vocab_to_token[x] - 1
for x in np.array(sequence)[predict_idx]
]
prob = prob[range(prob.shape[0]), predict_idx, true_idx]
pll = np.log(prob).sum()
if average:
pll /= len(predict_idx)
return pll
@requires_extra("esm", install_name="fair-esm")
def esm_pll(
chain_sequences,
predict_masks,
esm_model_name="esm2_t30_150M_UR50D",
esm_model_objects=None,
average=False,
):
"""Compute pseudo log likelihood.
Parameters
----------
chain_sequences : list of str
List of chain sequences (strings of amino acid codes)
predict_masks : list of np.ndarray
List of predict masks corresponding to the sequences (arrays of 0 and 1 where 1 indicates a predicted residue)
esm_model_name : str, default "esm2_t30_150M_UR50D"
Name of the ESM-2 model to use
esm_model_objects : tuple, optional
Tuple of ESM-2 model, batch converter and tok_to_idx dictionary (if not None, `esm_model_name` will be ignored)
average : bool, default False
Whether to average the pseudo log likelihood over the residues
Returns
-------
pll: float
Pseudo log likelihood
"""
predict_mask = []
for mask in predict_masks:
predict_mask.append(mask)
predict_mask.append(np.zeros(2))
predict_mask = np.concatenate(predict_mask, axis=0)
predict_idx = np.where(predict_mask)[0]
sequence = []
for i, seq in enumerate(chain_sequences):
sequence += list(seq)
if i != len(chain_sequences) - 1:
sequence += ["<eos>", "<cls>"]
if esm_model_objects is None:
esm_model, batch_converter, tok_to_idx = _get_esm_model(esm_model_name)
else:
esm_model, batch_converter, tok_to_idx = esm_model_objects
pll = 0
for i in predict_idx:
sequence_ = "".join(sequence[:i]) + "<mask>" + "".join(sequence[i + 1 :])
_, _, batch_tokens = batch_converter([(0, sequence_)])
if torch.cuda.is_available():
batch_tokens = batch_tokens.to("cuda")
with torch.no_grad():
results = esm_model(batch_tokens, repr_layers=[6], return_contacts=False)
logits = results["logits"][0, i + 1].detach().cpu()
tok_idx = tok_to_idx[sequence[i]]
prob = F.softmax(logits[4:24], dim=-1)[tok_idx - 4]
pll += torch.log(prob).item()
if average:
pll /= len(predict_idx)
return pll
def ca_rmsd(coordinates1, coordinates2):
"""Calculate CA RMSD between two structures.
Parameters
----------
coordinates1 : np.ndarray
The CA coordinates array of the first structure, shaped `(L, 3)`
coordinates2 : ProteinEntry
The CA coordinates array of the second structure, shaped `(L, 3)`
Returns
-------
rmsd : float
The RMSD between the two structures
"""
return np.sqrt(((coordinates1 - coordinates2) ** 2).sum(axis=-1).mean())
@requires_extra("tmtools")
def tm_score(coordinates1, coordinates2, sequence1, sequence2):
"""Calculate TM-score between two structures.
Parameters
----------
coordinates1 : np.ndarray
The CA coordinates array of the first structure, shaped `(L, 3)`
coordinates2 : ProteinEntry
The CA coordinates array of the second structure, shaped `(L, 3)`
sequence1 : str
The sequence of the first structure
sequence2 : str
The sequence of the second structure
Returns
-------
tm_score : float
The TM-score between the two structures
"""
res = tm_align(coordinates1, coordinates2, sequence1, sequence2)
return (res.tm_norm_chain1 + res.tm_norm_chain2) / 2
@requires_extra("esm", install_name="fair-esm[esmfold]")
def esmfold_generate(sequences, filepaths=None):
"""Generate PDB structures using ESMFold.
Note that you need to install `fair-esm` with the `esmfold` option (see https://github.com/facebookresearch/esm/tree/main).
The model also requires > 16GB CPU and GPU memory.
Parameters
----------
sequences : list of str
List of sequences to be generated (chains separated with `':'`)
filepaths : list of str, default None
List of filepaths for the generated structures
"""
assert filepaths is None or len(filepaths) == len(sequences)
print("Loading the ESMFold model...")
model = esm.pretrained.esmfold_v1()
model = model.eval().cuda()
print("Model loaded.")
if filepaths is None:
if not os.path.exists("esmfold_output"):
os.mkdir("esmfold_output")
filepaths = [
os.path.join("esmfold_output", f"seq_{i}.pdb")
for i in range(len(sequences))
]
with torch.no_grad():
for sequence, path in tqdm(zip(sequences, filepaths), total=len(sequences)):
output = model.infer_pdb(sequence)
with open(path, "w") as f:
f.write(output)
@requires_extra("igfold")
def igfold_generate(sequence_dicts, filepaths=None, use_openmm=False):
"""Generate PDB structures using IgFold.
Note that you need to install `igfold` (see https://github.com/Graylab/IgFold).
Parameters
----------
sequence_dicts : list of dict
List of sequence dictionaries (keys: "H", "L" for heavy and light chains)
filepaths : list of str, optional
List of filepaths for the generated structures
use_openmm : bool, default False
Whether to use refinement with OpenMM
"""
assert filepaths is None or len(filepaths) == len(sequence_dicts)
igfold = IgFoldRunner()
folder = "igfold_refine_output" if use_openmm else "igfold_output"
if filepaths is None:
if not os.path.exists(folder):
os.mkdir(folder)
filepaths = [
os.path.join(folder, f"seq_{i}.pdb") for i in range(len(sequence_dicts))
]
for seqs, path in tqdm(zip(sequence_dicts, filepaths), total=len(sequence_dicts)):
igfold.fold(
path, # Output PDB file
sequences=seqs, # Antibody sequences
do_refine=use_openmm, # Refine the antibody structure
use_openmm=use_openmm, # Use OpenMM for refinement
do_renum=False, # Renumber predicted antibody structure (Chothia)
)
@requires_extra("ImmuneBuilder")
def immunebuilder_generate(sequence_dicts, filepaths=None, protein_type="antibody"):
"""Generate PDB structures using ImmuneBuilder.
Note that you need to install `immunebuilder` (see https://github.com/oxpig/ImmuneBuilder)
Parameters
----------
sequence_dicts : list of dict
List of sequence dictionaries (keys: "H", "L" for heavy and light chains)
filepaths : list of str, optional
List of filepaths for the generated structures
protein_type: {"antibody", "nanobody", "tcr"}
Type of the structure to generate
"""
predictor_classes = {
"antibody": ABodyBuilder2,
"nanobody": NanoBodyBuilder2,
"tcr": TCRBuilder2,
}
predictor = predictor_classes[protein_type]()
folder = "immunebuilder_output"
if filepaths is None:
if not os.path.exists(folder):
os.mkdir(folder)
filepaths = [
os.path.join(folder, f"seq_{i}.pdb") for i in range(len(sequence_dicts))
]
for seqs, path in tqdm(zip(sequence_dicts, filepaths), total=len(sequence_dicts)):
out = predictor.predict(seqs)
out.save(path)
def confidence_from_file(filepath, predict_mask=None):
"""Get the average pLDDT or pRMSD score of a structure generated with ESMFold or IgFold.
This function loads the metric that is stored in the B-factor column of the PDB file.
For files generated with ESMFold, the metric is pLDDT; for IgFold and ImmuneBuilder, the metric is pRMSD.
Parameters
----------
filepath : str
Filepath of the structure
predict_mask : np.ndarray, default None
Predict mask of the structure (1 indicates a predicted residue, 0 otherwise)
Returns
-------
confidence : float
Average PLDDT / pRMSD score of the structure
"""
struct = bsio.load_structure(filepath, extra_fields=["b_factor"])
if predict_mask is not None:
order = -1
order_array = []
for i in range(len(struct)):
if struct[i].atom_name == "N":
order += 1
order_array.append(order)
b_factor = [
atom.b_factor
for order, atom in zip(order_array, struct)
if predict_mask[order] == 1
]
return np.array(b_factor).mean()
else:
return struct.b_factor.mean()
Functions
def ablang_pll(sequence, predict_mask, ablang_model_name='heavy', average=False)
-
Compute pseudo log likelihood.
Note that you need to install
ablang
(see https://github.com/oxpig/AbLang/tree/main).Parameters
sequence
:str
- Chain sequence (string of amino acid codes)
predict_mask
:np.ndarray
- Predict mask corresponding to the sequence (array of 0 and 1 where 1 indicates a predicted residue)
ablang_model_name
:{"heavy", "light"}
, default"heavy"
- Name of the AbLang model to use
average
:bool
, defaultFalse
- Whether to average the pseudo log likelihood over the residues
Returns
pll
:float
- Pseudo log likelihood
Expand source code
@requires_extra("ablang") def ablang_pll( sequence, predict_mask, ablang_model_name="heavy", average=False, ): """Compute pseudo log likelihood. Note that you need to install `ablang` (see https://github.com/oxpig/AbLang/tree/main). Parameters ---------- sequence : str Chain sequence (string of amino acid codes) predict_mask : np.ndarray Predict mask corresponding to the sequence (array of 0 and 1 where 1 indicates a predicted residue) ablang_model_name : {"heavy", "light"}, default "heavy" Name of the AbLang model to use average : bool, default False Whether to average the pseudo log likelihood over the residues Returns ------- pll: float Pseudo log likelihood """ ablang_model = ablang.pretrained( ablang_model_name ) # Use "light" if you are working with light chains ablang_model.freeze() sequences = [] sequence = list(sequence) predict_idx = np.where(predict_mask)[0] for i in predict_idx: sequences.append("".join(sequence[:i]) + "*" + "".join(sequence[i + 1 :])) logits = ablang_model(sequences, mode="likelihood")[:, 1:] exp_logits = np.exp(logits) prob = exp_logits / exp_logits.sum(axis=-1, keepdims=True) true_idx = [ ablang_model.tokenizer.vocab_to_token[x] - 1 for x in np.array(sequence)[predict_idx] ] prob = prob[range(prob.shape[0]), predict_idx, true_idx] pll = np.log(prob).sum() if average: pll /= len(predict_idx) return pll
def blosum62_score(seq_before, seq_after)
-
Calculate the BLOSUM62 score between two sequences.
Parameters
seq_before
:str
- The sequence before the mutation
seq_after
:str
- The sequence after the mutation
Returns
score
:int
- The BLOSUM62 score between the two sequences
Expand source code
@requires_extra("blosum") def blosum62_score(seq_before, seq_after): """Calculate the BLOSUM62 score between two sequences. Parameters ---------- seq_before : str The sequence before the mutation seq_after : str The sequence after the mutation Returns ------- score : int The BLOSUM62 score between the two sequences """ assert len(seq_before) == len(seq_after) matrix = bl.BLOSUM(62) score = 0 for x_before, x_after in zip(seq_before, seq_after): score += matrix[x_before][x_after] return score
def ca_rmsd(coordinates1, coordinates2)
-
Calculate CA RMSD between two structures.
Parameters
coordinates1
:np.ndarray
- The CA coordinates array of the first structure, shaped
(L, 3)
coordinates2
:ProteinEntry
- The CA coordinates array of the second structure, shaped
(L, 3)
Returns
rmsd
:float
- The RMSD between the two structures
Expand source code
def ca_rmsd(coordinates1, coordinates2): """Calculate CA RMSD between two structures. Parameters ---------- coordinates1 : np.ndarray The CA coordinates array of the first structure, shaped `(L, 3)` coordinates2 : ProteinEntry The CA coordinates array of the second structure, shaped `(L, 3)` Returns ------- rmsd : float The RMSD between the two structures """ return np.sqrt(((coordinates1 - coordinates2) ** 2).sum(axis=-1).mean())
def confidence_from_file(filepath, predict_mask=None)
-
Get the average pLDDT or pRMSD score of a structure generated with ESMFold or IgFold.
This function loads the metric that is stored in the B-factor column of the PDB file. For files generated with ESMFold, the metric is pLDDT; for IgFold and ImmuneBuilder, the metric is pRMSD.
Parameters
filepath
:str
- Filepath of the structure
predict_mask
:np.ndarray
, defaultNone
- Predict mask of the structure (1 indicates a predicted residue, 0 otherwise)
Returns
confidence
:float
- Average PLDDT / pRMSD score of the structure
Expand source code
def confidence_from_file(filepath, predict_mask=None): """Get the average pLDDT or pRMSD score of a structure generated with ESMFold or IgFold. This function loads the metric that is stored in the B-factor column of the PDB file. For files generated with ESMFold, the metric is pLDDT; for IgFold and ImmuneBuilder, the metric is pRMSD. Parameters ---------- filepath : str Filepath of the structure predict_mask : np.ndarray, default None Predict mask of the structure (1 indicates a predicted residue, 0 otherwise) Returns ------- confidence : float Average PLDDT / pRMSD score of the structure """ struct = bsio.load_structure(filepath, extra_fields=["b_factor"]) if predict_mask is not None: order = -1 order_array = [] for i in range(len(struct)): if struct[i].atom_name == "N": order += 1 order_array.append(order) b_factor = [ atom.b_factor for order, atom in zip(order_array, struct) if predict_mask[order] == 1 ] return np.array(b_factor).mean() else: return struct.b_factor.mean()
def esm_pll(chain_sequences, predict_masks, esm_model_name='esm2_t30_150M_UR50D', esm_model_objects=None, average=False)
-
Compute pseudo log likelihood.
Parameters
chain_sequences
:list
ofstr
- List of chain sequences (strings of amino acid codes)
predict_masks
:list
ofnp.ndarray
- List of predict masks corresponding to the sequences (arrays of 0 and 1 where 1 indicates a predicted residue)
esm_model_name
:str
, default"esm2_t30_150M_UR50D"
- Name of the ESM-2 model to use
esm_model_objects
:tuple
, optional- Tuple of ESM-2 model, batch converter and tok_to_idx dictionary (if not None,
esm_model_name
will be ignored) average
:bool
, defaultFalse
- Whether to average the pseudo log likelihood over the residues
Returns
pll
:float
- Pseudo log likelihood
Expand source code
@requires_extra("esm", install_name="fair-esm") def esm_pll( chain_sequences, predict_masks, esm_model_name="esm2_t30_150M_UR50D", esm_model_objects=None, average=False, ): """Compute pseudo log likelihood. Parameters ---------- chain_sequences : list of str List of chain sequences (strings of amino acid codes) predict_masks : list of np.ndarray List of predict masks corresponding to the sequences (arrays of 0 and 1 where 1 indicates a predicted residue) esm_model_name : str, default "esm2_t30_150M_UR50D" Name of the ESM-2 model to use esm_model_objects : tuple, optional Tuple of ESM-2 model, batch converter and tok_to_idx dictionary (if not None, `esm_model_name` will be ignored) average : bool, default False Whether to average the pseudo log likelihood over the residues Returns ------- pll: float Pseudo log likelihood """ predict_mask = [] for mask in predict_masks: predict_mask.append(mask) predict_mask.append(np.zeros(2)) predict_mask = np.concatenate(predict_mask, axis=0) predict_idx = np.where(predict_mask)[0] sequence = [] for i, seq in enumerate(chain_sequences): sequence += list(seq) if i != len(chain_sequences) - 1: sequence += ["<eos>", "<cls>"] if esm_model_objects is None: esm_model, batch_converter, tok_to_idx = _get_esm_model(esm_model_name) else: esm_model, batch_converter, tok_to_idx = esm_model_objects pll = 0 for i in predict_idx: sequence_ = "".join(sequence[:i]) + "<mask>" + "".join(sequence[i + 1 :]) _, _, batch_tokens = batch_converter([(0, sequence_)]) if torch.cuda.is_available(): batch_tokens = batch_tokens.to("cuda") with torch.no_grad(): results = esm_model(batch_tokens, repr_layers=[6], return_contacts=False) logits = results["logits"][0, i + 1].detach().cpu() tok_idx = tok_to_idx[sequence[i]] prob = F.softmax(logits[4:24], dim=-1)[tok_idx - 4] pll += torch.log(prob).item() if average: pll /= len(predict_idx) return pll
def esmfold_generate(sequences, filepaths=None)
-
Generate PDB structures using ESMFold.
Note that you need to install
fair-esm
with theesmfold
option (see https://github.com/facebookresearch/esm/tree/main). The model also requires > 16GB CPU and GPU memory.Parameters
sequences
:list
ofstr
- List of sequences to be generated (chains separated with
':'
) filepaths
:list
ofstr
, defaultNone
- List of filepaths for the generated structures
Expand source code
@requires_extra("esm", install_name="fair-esm[esmfold]") def esmfold_generate(sequences, filepaths=None): """Generate PDB structures using ESMFold. Note that you need to install `fair-esm` with the `esmfold` option (see https://github.com/facebookresearch/esm/tree/main). The model also requires > 16GB CPU and GPU memory. Parameters ---------- sequences : list of str List of sequences to be generated (chains separated with `':'`) filepaths : list of str, default None List of filepaths for the generated structures """ assert filepaths is None or len(filepaths) == len(sequences) print("Loading the ESMFold model...") model = esm.pretrained.esmfold_v1() model = model.eval().cuda() print("Model loaded.") if filepaths is None: if not os.path.exists("esmfold_output"): os.mkdir("esmfold_output") filepaths = [ os.path.join("esmfold_output", f"seq_{i}.pdb") for i in range(len(sequences)) ] with torch.no_grad(): for sequence, path in tqdm(zip(sequences, filepaths), total=len(sequences)): output = model.infer_pdb(sequence) with open(path, "w") as f: f.write(output)
def igfold_generate(sequence_dicts, filepaths=None, use_openmm=False)
-
Generate PDB structures using IgFold.
Note that you need to install
igfold
(see https://github.com/Graylab/IgFold).Parameters
sequence_dicts
:list
ofdict
- List of sequence dictionaries (keys: "H", "L" for heavy and light chains)
filepaths
:list
ofstr
, optional- List of filepaths for the generated structures
use_openmm
:bool
, defaultFalse
- Whether to use refinement with OpenMM
Expand source code
@requires_extra("igfold") def igfold_generate(sequence_dicts, filepaths=None, use_openmm=False): """Generate PDB structures using IgFold. Note that you need to install `igfold` (see https://github.com/Graylab/IgFold). Parameters ---------- sequence_dicts : list of dict List of sequence dictionaries (keys: "H", "L" for heavy and light chains) filepaths : list of str, optional List of filepaths for the generated structures use_openmm : bool, default False Whether to use refinement with OpenMM """ assert filepaths is None or len(filepaths) == len(sequence_dicts) igfold = IgFoldRunner() folder = "igfold_refine_output" if use_openmm else "igfold_output" if filepaths is None: if not os.path.exists(folder): os.mkdir(folder) filepaths = [ os.path.join(folder, f"seq_{i}.pdb") for i in range(len(sequence_dicts)) ] for seqs, path in tqdm(zip(sequence_dicts, filepaths), total=len(sequence_dicts)): igfold.fold( path, # Output PDB file sequences=seqs, # Antibody sequences do_refine=use_openmm, # Refine the antibody structure use_openmm=use_openmm, # Use OpenMM for refinement do_renum=False, # Renumber predicted antibody structure (Chothia) )
def immunebuilder_generate(sequence_dicts, filepaths=None, protein_type='antibody')
-
Generate PDB structures using ImmuneBuilder.
Note that you need to install
immunebuilder
(see https://github.com/oxpig/ImmuneBuilder)Parameters
sequence_dicts
:list
ofdict
- List of sequence dictionaries (keys: "H", "L" for heavy and light chains)
filepaths
:list
ofstr
, optional- List of filepaths for the generated structures
protein_type
:{"antibody", "nanobody", "tcr"}
- Type of the structure to generate
Expand source code
@requires_extra("ImmuneBuilder") def immunebuilder_generate(sequence_dicts, filepaths=None, protein_type="antibody"): """Generate PDB structures using ImmuneBuilder. Note that you need to install `immunebuilder` (see https://github.com/oxpig/ImmuneBuilder) Parameters ---------- sequence_dicts : list of dict List of sequence dictionaries (keys: "H", "L" for heavy and light chains) filepaths : list of str, optional List of filepaths for the generated structures protein_type: {"antibody", "nanobody", "tcr"} Type of the structure to generate """ predictor_classes = { "antibody": ABodyBuilder2, "nanobody": NanoBodyBuilder2, "tcr": TCRBuilder2, } predictor = predictor_classes[protein_type]() folder = "immunebuilder_output" if filepaths is None: if not os.path.exists(folder): os.mkdir(folder) filepaths = [ os.path.join(folder, f"seq_{i}.pdb") for i in range(len(sequence_dicts)) ] for seqs, path in tqdm(zip(sequence_dicts, filepaths), total=len(sequence_dicts)): out = predictor.predict(seqs) out.save(path)
def long_repeat_num(seq, thr=5)
-
Calculate the number of long repeats in a sequence.
Parameters
seq
:str
- The sequence to be evaluated.
thr
:int
, default5
- The threshold of the length of a repeat.
Returns
num
:int
- The number of long repeats in the sequence.
Expand source code
@requires_extra("blosum") def long_repeat_num(seq, thr=5): """Calculate the number of long repeats in a sequence. Parameters ---------- seq : str The sequence to be evaluated. thr : int, default 5 The threshold of the length of a repeat. Returns ------- num : int The number of long repeats in the sequence. """ arr = np.array(list(seq)) # Find the indices where the array changes its value changes = np.flatnonzero(arr[:-1] != arr[1:]) # Split the array into consecutive groups groups = np.split(arr, changes + 1) # Filter groups that are longer than N long_groups = filter(lambda g: len(g) > thr, groups) # Count the number of long groups count = sum(1 for _ in long_groups) return count
def tm_score(coordinates1, coordinates2, sequence1, sequence2)
-
Calculate TM-score between two structures.
Parameters
coordinates1
:np.ndarray
- The CA coordinates array of the first structure, shaped
(L, 3)
coordinates2
:ProteinEntry
- The CA coordinates array of the second structure, shaped
(L, 3)
sequence1
:str
- The sequence of the first structure
sequence2
:str
- The sequence of the second structure
Returns
tm_score
:float
- The TM-score between the two structures
Expand source code
@requires_extra("tmtools") def tm_score(coordinates1, coordinates2, sequence1, sequence2): """Calculate TM-score between two structures. Parameters ---------- coordinates1 : np.ndarray The CA coordinates array of the first structure, shaped `(L, 3)` coordinates2 : ProteinEntry The CA coordinates array of the second structure, shaped `(L, 3)` sequence1 : str The sequence of the first structure sequence2 : str The sequence of the second structure Returns ------- tm_score : float The TM-score between the two structures """ res = tm_align(coordinates1, coordinates2, sequence1, sequence2) return (res.tm_norm_chain1 + res.tm_norm_chain2) / 2