Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: allele rle normalization + pin pydantic version #234

Merged
merged 16 commits into from
Aug 30, 2023
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ install_requires =
jsonschema>=4.17.3
numpy
pyyaml
pydantic>=2.0.3
pydantic == 2.1.1
setup_requires =
cython
pytest-runner
Expand Down
213 changes: 172 additions & 41 deletions src/ga4gh/vrs/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,60 +3,192 @@
See https://vrs.ga4gh.org/en/stable/impl-guide/normalization.html

"""

import logging
from enum import IntEnum
from typing import NamedTuple, Optional, Union

from bioutils.normalize import normalize as _normalize, NormalizationMode
from ga4gh.core import is_pydantic_instance, ga4gh_digest, pydantic_copy

from ._internal import models
from .dataproxy import SequenceProxy

_logger = logging.getLogger(__name__)

class PosType(IntEnum):
"""Define the kind of position on a location"""

INTEGER = 0
RANGE_LT_OR_EQUAL = 1
RANGE_GT_OR_EQUAL = 2


class LocationPos(NamedTuple):
"""Define Allele location pos value and type"""

value: int
pos_type: PosType

def _normalize_allele(input_allele, data_proxy):

def _get_allele_location_pos(
allele_vo: models.Allele, use_start: bool = True
) -> Optional[LocationPos]:
"""Get Allele location start or end value for interval
korikuzma marked this conversation as resolved.
Show resolved Hide resolved

:param allele_vo: VRS Allele object
:param use_start: `True` if using `allele_vo.location.start`. `False` if using
`allele_vo.location.end`
:return: A `LocationPos` if using integer or indefinite range. Otherwise return
`None`
"""
Converts .location.sequence into an IRI if it is a SequenceReference because it makes the code simpler.
If it started as a sequence reference, put it back as one at the end.
if use_start:
pos = allele_vo.location.start
else:
pos = allele_vo.location.end

if isinstance(pos, int):
val = pos
pos_type = PosType.INTEGER
else:
pos0_is_none = pos.root[0] is None
pos1_is_none = pos.root[1] is None

if not pos0_is_none and not pos1_is_none: # definite range
return None

val = pos.root[0] or pos.root[1]
pos_type = PosType.RANGE_LT_OR_EQUAL if pos0_is_none else PosType.RANGE_GT_OR_EQUAL

return LocationPos(value=val, pos_type=pos_type)


def _get_new_allele_location_pos(
new_pos_val: int, pos_type: PosType
) -> Union[int, models.Range]:
"""Get updated location pos on normalized allele

:param new_pos_val: New position after normalization
:param pos_type: Original position type used in pre-normalized VRS Allele object
:return: Updated position as integer or VRS Range object
"""
if pos_type == PosType.INTEGER:
val = new_pos_val
elif pos_type == PosType.RANGE_LT_OR_EQUAL:
val = models.Range([None, new_pos_val])
else:
val = models.Range([new_pos_val, None])
return val


def _normalize_allele(input_allele, data_proxy, rle_seq_limit=50):
"""Normalize Allele using "fully-justified" normalization adapted from NCBI's
VOCA. Fully-justified normalization expands such ambiguous representation over the
entire region of ambiguity, resulting in an unambiguous representation that may be
readily compared with other alleles.

:param input_allele: Input VRS Allele object
:param data_proxy: SeqRepo dataproxy
:param rle_seq_limit: If RLE is set as the new state, set the limit for the length
of the `sequence`.
To exclude `sequence` from the response, set to 0.
For no limit, set to `None`.
korikuzma marked this conversation as resolved.
Show resolved Hide resolved

Does not attempt to normalize Allele's with definite ranges. Will return the
korikuzma marked this conversation as resolved.
Show resolved Hide resolved
`input_allele`
"""
allele = pydantic_copy(input_allele)

# Temporarily convert SequenceReference to IRI because it makes the code simpler.
# This will be changed back to SequenceReference at the end of the method
sequence_reference = None
if isinstance(allele.location.sequence, models.SequenceReference):
sequence_reference = allele.location.sequence
allele.location.sequence = models.IRI(sequence_reference.refgetAccession)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not following this. An IRI isn't guaranteed to have the digest in it, so we should always assume a SequenceReference, or have a mechanism for resolving the IRI to get a SequenceReference. I might be missing something here, but why not simply remove this block and then revise line 107 to pull from the digest field expected in every object?

Suggested change
# Temporarily convert SequenceReference to IRI because it makes the code simpler.
# This will be changed back to SequenceReference at the end of the method
sequence_reference = None
if isinstance(allele.location.sequence, models.SequenceReference):
sequence_reference = allele.location.sequence
allele.location.sequence = models.IRI(sequence_reference.refgetAccession)

Copy link
Contributor Author

@korikuzma korikuzma Aug 25, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was initially done by @theferrit32 . @ahwagner is the digest expected in every VRS object? In the models.py, the digest is set as an optional field for all VRS objects.

I'm trying to figure out why this was done, but I'm now realizing that this is an issue if allele.location.sequence is not defined. A SequenceLocation.sequence is listed as an optional field. @ahwagner can you explain why this is not a required field?

Copy link
Member

@ahwagner ahwagner Aug 26, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ahwagner is the digest expected in every VRS object?

Yes, this is something that I think @theferrit32 was working on. It is available to every object, and our digest strategy in VRS 2 will compute this for every object, whether or not it is an identifiable object. I believe we were going to be using this for all objects in VRS-Python.

A SequenceLocation.sequence is listed as an optional field. @ahwagner can you explain why this is not a required field?

Yes, it is optional to use this attribute in JSON Schema, because when used in an Allele that is part of a Haplotype, the SequenceLocation.sequence can be omitted from VRS messages, as they (by definition) will match the sequence of the parent Haplotype object. However, it is required from the ga4ghDigest.keys for creating computed digests, because it is still a critical component of the value of a SequenceLocation. In those cases, it is expected that the system loading the VRS Haplotype object would refer to / copy over the Haplotype.sequence for the Haplotype.member[*].location.sequence properties.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ahwagner I think there might be some confusion regarding the digest field. At the moment, digest is optional and does not get computed when a VRS Object is created. Should this field be populated each time a user creates a VRS Object?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ahwagner is suggesting to switch this condition to IRI. Some test cases: #seqrefs/myseq123 and HTTPS://w3id.org/NM_012345

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We will need to dereference the #seqrefs/myseq123 outside this function because this function only receives the allele to be normalized, it won't have access to the full original document the allele came from, where #seqrefs/myseq123 can be resolved from.

We can create another function like ga4gh_inline (or something) that takes a JSON document, finds the GA4GH objects in it, and for any field whose value is an IRI that is a relative pointer, inline the object it points to in that field, if the document contains it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we do that the type fields would then need to be required on the input JSON/dict. They are not currently required because the type fields are defined as literals and when you construct a particular class with some input it assumes it is that type and fills in the type field.

If we don't want this constraint, we could just traverse the input document and replace all field values that look like JSON pointers (not just those in fields that are defined as IRIs in the VRS models) with the objects they refer to. This would also let people use the JSON pointer thing in non-model fields. Like if someone has a statement and a custom field they added that isn't in the model, but they want to refer to the variant in the same document from there using a pointer.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@theferrit32 updated comments + tests with deferenced IRIs. Let me know if I need to make any more changes!


sequence = SequenceProxy(data_proxy, allele.location.sequence.root)
# Get reference sequence and interval
ref_seq = SequenceProxy(data_proxy, allele.location.sequence.root)
korikuzma marked this conversation as resolved.
Show resolved Hide resolved
start = _get_allele_location_pos(allele, use_start=True)
if start is None:
return input_allele

ival = (allele.location.start, allele.location.end)
end = _get_allele_location_pos(allele, use_start=False)
if end is None:
return input_allele

_allele_state = allele.state.type
_states_with_sequence = ["ReferenceLengthExpression", "LiteralSequenceExpression"]
if _allele_state in _states_with_sequence:
ival = (start.value, end.value)
if allele.state.sequence:
alleles = (None, allele.state.sequence.root)
elif _allele_state == "RepeatedSequenceExpression" and \
allele.state.seq_expr.type in _states_with_sequence:
alleles = (None, allele.state.seq_expr.sequence.root)
else:
alleles = (None, "")

new_allele = pydantic_copy(allele)

# Trim common flanking sequence from Allele sequences.
try:
new_ival, new_alleles = _normalize(sequence,
ival,
alleles=alleles,
mode=NormalizationMode.EXPAND,
anchor_length=0)
trim_ival, trim_alleles = _normalize(ref_seq, ival, alleles, mode=None, trim=True)
except ValueError:
# Occurs for ref agree Alleles (when alt = ref)
len_ref_seq = len_alt_seq = 0
else:
trim_ref_seq = ref_seq[trim_ival[0]: trim_ival[1]]
trim_alt_seq = trim_alleles[1]
len_ref_seq = len(trim_ref_seq)
len_alt_seq = len(trim_alt_seq)

new_allele.location.start = new_ival[0]
new_allele.location.end = new_ival[1]
# Compare the two allele sequences
if not len_ref_seq and not len_alt_seq:
return input_allele

if new_allele.state.type in _states_with_sequence:
new_allele.state.sequence = models.SequenceString(new_alleles[1])
new_allele = pydantic_copy(allele)

if len_ref_seq and len_alt_seq:
new_allele.location.start = _get_new_allele_location_pos(
trim_ival[0], start.pos_type
)
new_allele.location.end = _get_new_allele_location_pos(
trim_ival[1], end.pos_type
)
new_allele.state.sequence = models.SequenceString(trim_alleles[1])
if sequence_reference:
new_allele.location.sequence = sequence_reference
return new_allele

# Determine bounds of ambiguity
try:
new_ival, new_alleles = _normalize(
ref_seq,
trim_ival,
(None, trim_alleles[1]),
mode=NormalizationMode.EXPAND
)
except ValueError:
# Occurs for ref agree Alleles (when alt = ref)
pass
else:
new_allele.location.start = _get_new_allele_location_pos(
new_ival[0], start.pos_type
)
new_allele.location.end = _get_new_allele_location_pos(
new_ival[1], end.pos_type
)

new_ref_seq = ref_seq[new_ival[0]: new_ival[1]]

if not new_ref_seq:
# If the reference sequence is empty this is an unambiguous insertion.
# Return a new Allele with the trimmed alternate sequence as a Literal
# Sequence Expression
new_allele.state = models.LiteralSequenceExpression(
sequence=models.SequenceString(new_alleles[1])
)
else:
# Otherwise, return a new Allele using a RLE
sequence = models.SequenceString(new_alleles[1])
len_sequence = len(sequence.root)

new_allele.state = models.ReferenceLengthExpression(
length=len_sequence,
repeatSubunitLength=len_ref_seq or len_alt_seq
)

if (rle_seq_limit and len_sequence < rle_seq_limit) or (rle_seq_limit is None):
new_allele.state.sequence = sequence
korikuzma marked this conversation as resolved.
Show resolved Hide resolved

if sequence_reference:
new_allele.location.sequence = sequence_reference
Expand All @@ -73,27 +205,26 @@ def _normalize_haplotype(o, data_proxy=None):
return o


def _normalize_variationset(o, data_proxy=None):
o.members = sorted(o.members, key=ga4gh_digest)
return o


handlers = {
"Allele": _normalize_allele,
"Haplotype": _normalize_haplotype,
"VariationSet": _normalize_variationset,
}


def normalize(vo, data_proxy=None):
"""normalize given vrs object, regardless of type"""
def normalize(vo, data_proxy=None, **kwargs):
"""normalize given vrs object, regardless of type

kwargs:
rle_seq_limit: If RLE is set as the new state, set the limit for the length
of the `sequence`. To exclude `state.sequence`, set to 0.
"""

assert is_pydantic_instance(vo)
vo_type = vo.type

if vo_type in handlers:
handler = handlers[vo_type]
return handler(vo, data_proxy)
return handler(vo, data_proxy, **kwargs)

# No handler for vo_type; pass-through unchanged
return vo
Expand Down Expand Up @@ -122,13 +253,13 @@ def normalize(vo, data_proxy=None):
},
"type": "Allele"
}
allele = models.Allele(**allele_dict)
a = models.Allele(**allele_dict)

allele2 = normalize(allele, dp)
allele2 = normalize(a, dp)

allele.state.sequence = "C"
allele3 = normalize(allele, dp)
a.state.sequence.root = "C"
allele3 = normalize(a, dp)

allele.location.interval.end = 44908823
allele.state.sequence = ""
allele4 = normalize(allele, dp)
a.location.end = 44908823
a.state.sequence.root = ""
allele4 = normalize(a, dp)
Loading