Skip to content

Commit

Permalink
Add hallmark residues, alignment names, fix kabat 82A rare case
Browse files Browse the repository at this point in the history
  • Loading branch information
prihoda committed Dec 11, 2024
1 parent ab068d6 commit 6f0f685
Show file tree
Hide file tree
Showing 6 changed files with 76 additions and 18 deletions.
2 changes: 1 addition & 1 deletion abnumber/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.4.1'
__version__ = '0.4.2'
29 changes: 20 additions & 9 deletions abnumber/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,15 @@ class Alignment:
...
"""
def __init__(self, positions, residues, scheme, chain_type):
def __init__(self, positions, residues, scheme, chain_type, names=None):
assert isinstance(positions, list), 'Expected list of positions and residues. ' \
'Use chain.align(other) to create an alignment.'
assert len(positions) == len(residues)
unique_cdr_definitions = set(pos.cdr_definition for pos in positions)
assert len(unique_cdr_definitions) <= 1, f'Aligned chains should use the same CDR definitions, got: {unique_cdr_definitions}'
num_seqs = len(residues[0])
self.names = names if names is not None else ([None] * num_seqs)
assert len(self.names) == num_seqs, 'Number of names should match number of sequences'
self.positions = positions
self.residues = residues
self.scheme = scheme
Expand Down Expand Up @@ -90,7 +93,7 @@ def slice(self, start: Union[str, int, 'Position'] = None, stop: Union[str, int,
new_positions.append(pos)
new_residues.append(residues)

return Alignment(positions=new_positions, residues=new_residues, scheme=self.scheme, chain_type=self.chain_type)
return Alignment(positions=new_positions, residues=new_residues, scheme=self.scheme, chain_type=self.chain_type, names=self.names)

def _parse_position(self, position: Union[int, str, 'Position'], allow_raw=False):
"""Create :class:`Position` key object from string or int.
Expand All @@ -116,27 +119,30 @@ def _parse_position(self, position: Union[int, str, 'Position'], allow_raw=False
return None
return self.positions[position]

def format(self, mark_identity=True, mark_cdrs=True):
def format(self, mark_identity=True, mark_cdrs=True, names=False):
"""Format alignment to string
:param mark_identity: Add BLAST style middle line showing identity (``|``), similar residue (``+``) or different residue (``.``)
:param mark_cdrs: Add line highlighting CDR regions using ``^``
:param names: Add chain.name to the beginning of each sequence
:return: formatted string
"""

def _identity_symbol(a, b):
return '|' if a == b else ('+' if is_similar_residue(a, b) else '.')

lines = []
longest_name = max(len(name or '') for name in self.names) if hasattr(self, 'names') and self.names else 0
for i in range(len(self.residues[0])):
name = self.names[i] if hasattr(self, 'names') and self.names else None
if mark_identity and i != 0:
lines.append(''.join(_identity_symbol(aas[i], aas[i-1]) for pos, aas in self))
lines.append(''.join(aas[i] for pos, aas in self))
lines.append((' '*(longest_name+1) if names else '') + ''.join(_identity_symbol(aas[i], aas[i-1]) for pos, aas in self))
lines.append(((name or '').rjust(longest_name) + ' ' if names else '') + ''.join(aas[i] for pos, aas in self))
if mark_cdrs:
if self.positions[0].cdr_definition == 'kabat':
lines.append(''.join('^' if pos.is_in_cdr() else ("°" if pos.is_in_vernier() else ' ') for pos in self.positions))
lines.append((' '*(longest_name+1) if names else '') + ''.join('^' if pos.is_in_cdr() else ("°" if pos.is_in_vernier() else ' ') for pos in self.positions))
else:
lines.append(''.join('^' if pos.is_in_cdr() else ' ' for pos in self.positions))
lines.append((' '*(longest_name+1) if names else '') + ''.join('^' if pos.is_in_cdr() else ' ' for pos in self.positions))
return '\n'.join(lines)

def print(self, mark_identity=True, mark_cdrs=True):
Expand Down Expand Up @@ -164,8 +170,13 @@ def num_mutations(self):
"""Get number of mutations (positions with more than one type of residue)"""
return sum(len(set(aas)) != 1 for aas in self.residues)

def num_identical(self):
"""Get number of positions with identical residues"""
def num_identical(self, ignore_cdrs = False):
"""Get number of positions with identical residues
:param ignore_cdrs: Ignore CDR regions when counting identical residues
"""
if ignore_cdrs:
return sum(len(set(aas)) == 1 for pos, aas in zip(self.positions, self.residues) if not pos.is_in_cdr())
return sum(len(set(aas)) == 1 for aas in self.residues)

def num_similar(self):
Expand Down
18 changes: 11 additions & 7 deletions abnumber/chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,7 +545,7 @@ def align(self, *other) -> 'Alignment':

shared_pos = sorted(set(pos for pos_dict in pos_dicts for pos in pos_dict.keys()))
residues = [tuple(pos_dict.get(pos, '-') for pos_dict in pos_dicts) for pos in shared_pos]
return Alignment(shared_pos, residues, chain_type=self.chain_type, scheme=self.scheme)
return Alignment(shared_pos, residues, chain_type=self.chain_type, scheme=self.scheme, names=[self.name] + [chain.name for chain in other])

def clone(self, replace_seq: str = None):
"""Create a copy of this chain, optionally with a replacement sequence
Expand Down Expand Up @@ -613,11 +613,12 @@ def renumber(self, scheme=None, cdr_definition=None, allowed_species=None):
assign_germline=self.v_gene is not None
)

def graft_cdrs_onto(self, other: 'Chain', backmutate_vernier=False, backmutations: List[Union['Position',str]] = [], name: str = None) -> 'Chain':
def graft_cdrs_onto(self, other: 'Chain', backmutate_vernier=False, backmutate_vhh_hallmark=False, backmutations: List[Union['Position',str]] = [], name: str = None) -> 'Chain':
"""Graft CDRs from this Chain onto another chain
:param other: Chain to graft CDRs into (source of frameworks and tail sequence)
:param backmutate_vernier: Also graft all Kabat Vernier positions from this chain (perform backmutations)
:param backmutate_vhh_hallmark: Also graft Kabat 37, 44, 45, and 47 from this chain (VHH Hallmark residues)
:param backmutations: List of positions that should additionally be grafted from this chain (str or or :class:`Position`)
:param name: Name of new Chain. If not provided, use name of this chain.
:return: Chain with CDRs grafted from this chain and frameworks from the given chain
Expand All @@ -638,6 +639,7 @@ def graft_cdrs_onto(self, other: 'Chain', backmutate_vernier=False, backmutation
continue
if pos.is_in_cdr() \
or (backmutate_vernier and pos.is_in_vernier()) \
or (backmutate_vhh_hallmark and pos.is_vhh_hallmark()) \
or pos in backmutations \
or grafted_dict.get(pos) == 'X':
grafted_dict[pos] = aa
Expand Down Expand Up @@ -691,13 +693,14 @@ def get_position_by_raw_index(self, index):
"""Get Position object at corresponding raw numeric position"""
return list(self.positions.keys())[index]

def find_human_germlines(self, limit=10, v_gene=None, j_gene=None, unique=True) -> Tuple[List['Chain'], List['Chain']]:
def find_human_germlines(self, limit=10, v_gene=None, j_gene=None, unique=True, rank_by_frameworks=False) -> Tuple[List['Chain'], List['Chain']]:
"""Find most identical V and J germline sequences based on IMGT alignment
:param limit: Number of best matching germlines to return
:param v_gene: Filter germlines to specific V gene name
:param j_gene: Filter germlines to specific J gene name
:param unique: Skip germlines with duplicate amino acid sequence
:param rank_by_frameworks: Prioritize framework sequence identity when finding nearest matches
:return: list of top V chains, list of top J chains
"""
from abnumber.germlines import get_imgt_v_chains, get_imgt_j_chains
Expand Down Expand Up @@ -727,24 +730,25 @@ def find_human_germlines(self, limit=10, v_gene=None, j_gene=None, unique=True)
j_chains = _get_unique_chains(j_chains)

v_alignments = [chain.align(germline) for germline in v_chains]
v_ranks = np.array([-alignment.num_identical() - (alignment.num_similar() * 0.01) for alignment in v_alignments]).argsort(kind='stable')[:limit]
v_ranks = np.array([-alignment.num_identical(ignore_cdrs=rank_by_frameworks) - (alignment.num_similar() * 0.01) for alignment in v_alignments]).argsort(kind='stable')[:limit]
top_v_chains = [v_chains[r] for r in v_ranks]

j_alignments = [chain.align(germline) for germline in j_chains]
j_ranks = np.array([-alignment.num_identical() - (alignment.num_similar() * 0.01) for alignment in j_alignments]).argsort(kind='stable')[:limit]
j_ranks = np.array([-alignment.num_identical(ignore_cdrs=rank_by_frameworks) - (alignment.num_similar() * 0.01) for alignment in j_alignments]).argsort(kind='stable')[:limit]
top_j_chains = [j_chains[r] for r in j_ranks]

return top_v_chains, top_j_chains

def find_merged_human_germline(self, top=0, v_gene=None, j_gene=None) -> 'Chain':
def find_merged_human_germline(self, top=0, v_gene=None, j_gene=None, rank_by_frameworks=False) -> 'Chain':
"""Find n-th most identical V and J germline sequence based on IMGT alignment and merge them into one Chain
:param top: Return top N most identical germline (0-indexed)
:param v_gene: Filter germlines to specific V gene name
:param j_gene: Filter germlines to specific J gene name
:param rank_by_frameworks: Prioritize framework sequence identity when finding nearest matches
:return: merged germline sequence Chain object
"""
v_chains, j_chains = self.find_human_germlines(limit=top+1, v_gene=v_gene, j_gene=j_gene)
v_chains, j_chains = self.find_human_germlines(limit=top+1, v_gene=v_gene, j_gene=j_gene, rank_by_frameworks=rank_by_frameworks)
v_chain = v_chains[top]
j_chain = j_chains[top]
return v_chain.merge(j_chain)
Expand Down
21 changes: 21 additions & 0 deletions abnumber/common.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import sys
import warnings
from typing import List, Tuple
import re
import numpy as np
Expand Down Expand Up @@ -42,6 +43,20 @@ def _anarci_align(sequences, scheme, allowed_species, assign_germline=False) ->
species = ali['species']
v_gene = ali['germlines']['v_gene'][0][1] if assign_germline else None
j_gene = ali['germlines']['j_gene'][0][1] if assign_germline else None

# Fix ANARCI bug returning same position number for consecutive positions, e.g. 82A, see test case in test_bugs.py
existing_positions = set()
for j, ((num, letter), aa) in enumerate(positions):
if aa == '-':
continue
if (num, letter) in existing_positions:
old_letter = letter
while (num, letter) in existing_positions:
letter = chr(ord(letter) + 1)
positions[j] = ((num, letter), aa)
# create UserWarning
warnings.warn(f'ANARCI returned duplicate position number "{num}{old_letter}", using "{num}{letter}" in sequence "{sequence}"', UserWarning)
existing_positions.add((num, letter))
aa_dict = {Position(chain_type=chain_type, number=num, letter=letter, scheme=scheme): aa
for (num, letter), aa in positions if aa != '-'}
next_start = None if i == len(seq_numbered) - 1 else seq_numbered[i+1][1]
Expand Down Expand Up @@ -133,5 +148,11 @@ def is_integer(object):
'kabat_L': frozenset([2, 4, 35, 36, 46, 47, 48, 49, 64, 66, 68, 69, 71, 98]),
}

SCHEME_HALLMARK = {
'kabat_H': frozenset([37, 44, 45, 47]),
'kabat_K': frozenset([]),
'kabat_L': frozenset([]),
}

#'kabat_H': 31-35, 50-65, 95-102
#'kabat_K': 24-34, 50-56, 89-97
11 changes: 10 additions & 1 deletion abnumber/position.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import copy
from typing import List, Union

from abnumber.common import _validate_chain_type, SCHEME_POSITION_TO_REGION, SCHEME_VERNIER, POS_REGEX
from abnumber.common import _validate_chain_type, SCHEME_POSITION_TO_REGION, SCHEME_VERNIER, POS_REGEX, SCHEME_HALLMARK


class Position:
Expand Down Expand Up @@ -124,6 +124,15 @@ def is_in_vernier(self):
raise NotImplementedError('Vernier zone identification is currently supported '
f'only with Kabat numbering or CDR definitions, got: {self.scheme}+{self.cdr_definition}')

def is_vhh_hallmark(self):
if self.scheme == 'kabat':
return self.number in SCHEME_HALLMARK.get(f'{self.scheme}_{self.chain_type}', [])
elif self.cdr_definition == 'kabat':
return self.cdr_definition_position in SCHEME_HALLMARK.get(f'{self.cdr_definition}_{self.chain_type}', [])
else:
raise NotImplementedError('Hallmark zone identification is currently supported '
f'only with Kabat numbering or CDR definitions, got: {self.scheme}+{self.cdr_definition}')

@classmethod
def from_string(cls, position, chain_type, scheme):
"""Create Position object from string, e.g. "H5"
Expand Down
13 changes: 13 additions & 0 deletions test/test_bugs.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,19 @@ def test_imgt_33A():
assert chain.seq == seq


def test_kabat_insertion():
# QVQLVESGGGLVQAGGSLSLSCAYSDSGRAFATVVMAWFRQPPGKDRDFVAGIRRSTNTYYADSVKGRFTISRDNAKNTVYLHINLKPEDTAVYYCAAXXXXXXXXXXXXXXAWGQGTQVTVSS
# x

seq = 'QVQLVESGGGLVQAGGSLSLSCAYSDSGRAFATVVMAWFRQPPGKDRDFVAGIRRSTNTYYADSVKGRFTISRDNAKNTVYLHINLKPEDTAVYYCAAXXXXXXXXXXXXXXAWGQGTQVTVSS'
chain = Chain(seq, 'kabat')
assert chain.seq == seq

seq = 'QVQLVESGGGLVQAGGSLSLSCAYSDSGRAFATVVMAWFRQPPGKDRDFVAGIRRSTNTYYADSVKGRFTISRDNAKNTVYLHINALKPEDTAVYYCAAXXXXXXXXXXXXXXAWGQGTQVTVSS'
chain = Chain(seq, 'kabat')
assert chain.seq == seq


def test_kappa_imgt_21():
from anarci import run_hmmer
sequence = 'DVVMTQSPLSLPVTLGQPASISCRSSQSLVYSDGNTYLNWFQQRPGQSPRRLIYKVSNRDSGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQGTFGQGTKVEIK'
Expand Down

0 comments on commit 6f0f685

Please sign in to comment.