diff --git a/src/api/routers/map.py b/src/api/routers/map.py index d53cc24..a657b41 100644 --- a/src/api/routers/map.py +++ b/src/api/routers/map.py @@ -1,4 +1,5 @@ """"Provide mapping router""" +import logging from pathlib import Path from cool_seq_tool.schemas import AnnotationLayer @@ -45,6 +46,8 @@ prefix="/api/v1", tags=["mappings"], responses={404: {"description": "Not found"}} ) +_logger = logging.getLogger(__name__) + @router.post(path="/map/{urn}", status_code=200, response_model=ScoresetMapping) @with_mavedb_score_set @@ -57,7 +60,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse try: metadata = get_scoreset_metadata(urn, store_path) records = get_scoreset_records(metadata, True, store_path) - metadata = patch_target_sequence_type(metadata, records) + metadata = patch_target_sequence_type(metadata, records, force=False) except ScoresetNotSupportedError as e: return JSONResponse( content=ScoresetMapping( diff --git a/src/api/server_main.py b/src/api/server_main.py index 0df2a50..cbff216 100644 --- a/src/api/server_main.py +++ b/src/api/server_main.py @@ -1,13 +1,26 @@ """FastAPI server file""" +import logging + import uvicorn from fastapi import FastAPI from api.routers import map +from dcd_mapping import dcd_mapping_version + +logging.basicConfig( + format="%(levelname)s:%(name)s:%(message)s", + level=logging.INFO, + force=True, +) +_logger = logging.getLogger(__name__) app = FastAPI() app.include_router(map.router) +msg = f"Starting DCD Mapping server v{dcd_mapping_version})" +_logger.info(msg) + # If the application is not already being run within a uvicorn server, start uvicorn here. if __name__ == "__main__": diff --git a/src/dcd_mapping/align.py b/src/dcd_mapping/align.py index 7f54b41..18325d5 100644 --- a/src/dcd_mapping/align.py +++ b/src/dcd_mapping/align.py @@ -3,6 +3,7 @@ import os import subprocess import tempfile +from collections.abc import Mapping from pathlib import Path from urllib.parse import urlparse @@ -19,7 +20,7 @@ ScoresetNotSupportedError, ) from dcd_mapping.lookup import get_chromosome_identifier, get_gene_location -from dcd_mapping.mavedb_data import LOCAL_STORE_PATH +from dcd_mapping.mavedb_data import LOCAL_STORE_PATH, patch_target_sequence_type from dcd_mapping.resource_utils import http_download from dcd_mapping.schemas import ( AlignmentResult, @@ -441,7 +442,7 @@ def parse_cdot_mapping(cdot_mapping: dict, silent: bool) -> AlignmentResult: def build_alignment_result( metadata: ScoresetMetadata, silent: bool -) -> dict[str, AlignmentResult | None]: +) -> Mapping[str, AlignmentResult | None]: # NOTE: Score set must contain all accession-based target genes or all sequence-based target genes # This decision was made because it is most efficient to run BLAT all together, so the alignment function # works on an entire score set rather than per target gene. @@ -462,7 +463,30 @@ def build_alignment_result( score_set_type = "sequence" if score_set_type == "sequence": - alignment_result = align(metadata, silent) + try: + alignment_result = align(metadata, silent) + except AlignmentError as e: + failed_at_nucleotide_level = any( + target_gene.target_sequence_type == TargetSequenceType.DNA + for target_gene in metadata.target_genes.values() + ) + + if failed_at_nucleotide_level: + msg = f"BLAT alignment failed for {metadata.urn} at the nucleotide level. This alignment will be retried at the protein level." + _logger.warning(msg) + else: + raise AlignmentError from e + + # So long as force=True, the content of the records dict is irrelevant. + try: + alignment_result = align( + patch_target_sequence_type(metadata, {}, force=True), silent + ) + except AlignmentError as e2: + msg = f"BLAT alignment failed for {metadata.urn} at the protein level after failing at the nucleotide level." + _logger.error(msg) + raise AlignmentError(msg) from e2 + else: alignment_result = fetch_alignment(metadata, silent) diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py index dac25d4..bcb3864 100644 --- a/src/dcd_mapping/main.py +++ b/src/dcd_mapping/main.py @@ -346,7 +346,7 @@ async def map_scoreset_urn( try: metadata = get_scoreset_metadata(urn, store_path) records = get_scoreset_records(metadata, silent, store_path) - metadata = patch_target_sequence_type(metadata, records) + metadata = patch_target_sequence_type(metadata, records, force=False) except ScoresetNotSupportedError as e: _emit_info(f"Score set not supported: {e}", silent, logging.ERROR) final_output = write_scoreset_mapping_to_json( diff --git a/src/dcd_mapping/mavedb_data.py b/src/dcd_mapping/mavedb_data.py index b3510b4..1dad2b2 100644 --- a/src/dcd_mapping/mavedb_data.py +++ b/src/dcd_mapping/mavedb_data.py @@ -326,21 +326,41 @@ def get_scoreset_records( def patch_target_sequence_type( - metadata: ScoresetMetadata, records: dict + metadata: ScoresetMetadata, records: dict, force: bool = False ) -> ScoresetMetadata: """If target sequence type is DNA but no nucleotide variants are defined, treat the target as if it were a protein level target. This avoids BLAT errors in cases where the target sequence was codon-optimized for a non-human organism """ + patch_sequence_type = force or any( + target.target_sequence_type == TargetSequenceType.DNA + and not any(record.hgvs_nt for record in records.get(target_label, [])) + for target_label, target in metadata.target_genes.items() + ) + + if not patch_sequence_type: + msg = f"Not patching target sequence type for {metadata.urn}. Either force=True (was {force}), or at least one target has nucleotide-level variants." + _logger.debug(msg) + return metadata + for target_label, target in metadata.target_genes.items(): - if target.target_sequence_type == TargetSequenceType.DNA and not any( - record.hgvs_nt for record in records.get(target_label, []) - ): - msg = f"Changing target sequence type for {metadata.urn} target {target_label} from DNA to protein because target only has protein-level variants" - _logger.info(msg) - target.target_sequence = _get_protein_sequence(target.target_sequence) - target.target_sequence_type = TargetSequenceType.PROTEIN + if not target.target_sequence: + msg = f"Cannot patch target sequence type for {metadata.urn} target {target_label} because no target sequence is available." + _logger.debug(msg) + continue + + if target.target_gene_category != "protein_coding": + msg = f"Cannot patch target sequence type for {metadata.urn} target {target_label} because target gene category is {target.target_gene_category}, not protein_coding." + _logger.debug(msg) + continue + + msg = f"Changing target sequence type for {metadata.urn} target {target_label} from DNA to protein. (force was {force})." + _logger.info(msg) + + target.target_sequence = _get_protein_sequence(target.target_sequence) + target.target_sequence_type = TargetSequenceType.PROTEIN + return metadata diff --git a/src/dcd_mapping/transcripts.py b/src/dcd_mapping/transcripts.py index 00080d3..9ec4a7a 100644 --- a/src/dcd_mapping/transcripts.py +++ b/src/dcd_mapping/transcripts.py @@ -1,6 +1,7 @@ """Select best reference sequence.""" import logging import re +from collections.abc import Mapping from Bio.Data.CodonTable import IUPACData from Bio.Seq import Seq @@ -345,7 +346,7 @@ async def select_transcript( async def select_transcripts( scoreset_metadata: ScoresetMetadata, records: dict[str, list[ScoreRow]], - align_results: dict[str, AlignmentResult | None], + align_results: Mapping[str, AlignmentResult | None], ) -> dict[str, TxSelectResult | Exception | None]: """Select appropriate human reference sequence for each target in a score set. :param scoreset_metadata: Metadata for score set from MaveDB API diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index db81d57..18642ff 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -842,6 +842,7 @@ def _construct_vrs_allele( ) -> Allele | Haplotype: alleles: list[Allele] = [] for hgvs_string in hgvs_strings: + _logger.info("Processing HGVS string: %s", hgvs_string) allele = translate_hgvs_to_vrs(hgvs_string) if pre_map: