Source code for charger.classifier

from enum import Enum, auto
from typing import Any, Dict, List, Optional, Set

from loguru import logger
from pysam import TabixFile

from .acmg_modules import run_pm4, run_pvs1
from .config import ACMG_MODULES, CHARGER_MODULES, CharGerConfig
from .custom_modules import run_pmc1, run_ppc1, run_ppc2, run_psc1
from .io import read_lines, read_tsv
from .result import CharGerResult
from .variant import ClinicalSignificance, GeneInheritanceMode, Variant

try:
    from typing import Final
except ImportError:
    # Backport additional typings prior to python 3.8
    from typing_extensions import Final  # type: ignore


logger.disable("charger")  # Disable emit logs by default


[docs]class ModuleAvailability(Enum): """Availability of a ACMG or CharGer modules. Used by :attr:`CharGer._acmg_module_availability` and :attr:`CharGer._charger_module_availability`. Examples: Skip PVS1 module by: >>> charger = CharGer(CharGerConfig()) >>> charger._acmg_module_availability = ModuleAvailability.USER_DISABLED """ ACTIVE = auto() """The module is active.""" USER_DISABLED = auto() """The module is disabled by user. The module will be skipped.""" INVALID_SETUP = auto() """ The module has invalid setup, such as no additional annotation provided. The module will be skipped. """
InheritanceGenesType = Dict[str, Optional[GeneInheritanceMode]] """Type hint alias for :attr:`CharGer.inheritance_genes`."""
[docs]class CharGer: """ Variant classifier. Args: config: CharGer's configurations. See :class:`~charger.config.CharGerConfig` for details to set it up. Examples: >>> config = CharGerConfig(...) >>> charger = CharGer() >>> charger.setup() >>> charger.run_acmg_modules() >>> charger.run_charger_modules() """ def __init__(self, config: CharGerConfig): self.config: Final[CharGerConfig] = config """Configuration as a :class:`~charger.config.CharGerConfig` object.""" self.input_variants: List[Variant] = [] """Parsed input variants.""" self.pathogenic_variants: List[Variant] = [] """Known pathogenic variants.""" self.inheritance_genes: InheritanceGenesType = {} """Variant inheritance dominance mode of the genes for PVS1 module.""" self.pp2_genes: Set[str] = set() """Genes marked for PP2 module.""" self.bp1_genes: Set[str] = set() """Genes marked for BP1 module.""" self.results: List[CharGerResult] = [] """ Classification results of the input variants. The length and order should always be the same as :attr:`input_variants`. """ self._acmg_module_availability: Dict[str, ModuleAvailability] = { m: ModuleAvailability.ACTIVE for module_type, modules in ACMG_MODULES.items() for m in modules } """The availability of all ACMG modules. By default all modules are active.""" self._charger_module_availability: Dict[str, ModuleAvailability] = { m: ModuleAvailability.ACTIVE for module_type, modules in CHARGER_MODULES.items() for m in modules } """The availability of all CharGer modules. By default all modules are active."""
[docs] def setup(self) -> None: """Setup all the intput data and annotation. Sequentially it calls: 1. :meth:`_validate_config` 2. :meth:`_read_input_variants` 3. :meth:`_read_pathogenic_variants` 4. :meth:`_read_inheritance_gene_table` 5. :meth:`_read_pp2_gene_list` 6. :meth:`_read_bp1_gene_list` 7. :meth:`_match_clinvar` """ self._validate_config() self._read_input_variants() self._read_pathogenic_variants() self._read_inheritance_gene_table() self._read_pp2_gene_list() self._read_bp1_gene_list() self._match_clinvar()
def _validate_config(self) -> None: """Validate the configuration.""" logger.info("Validate the given config") logger.debug(f"Given config: {self.config!r}") def _read_input_variants(self) -> None: """Read input VCF and set up the result template Load :attr:`input_variants` from :attr:`self.config.input <.CharGerConfig.input>`. Also populate :attr:`results` matching the input variant. """ if self.config.input is None: raise ValueError("No input file is given in the config") logger.info(f"Read input VCF from {self.config.input}") # TODO: Skip variants with filter, or with high allele frequency # num_skipped_variants: Dict[str, int] = {"has_filter": 0} for variant in Variant.read_and_parse_vcf(self.config.input): # # Skip the variant with filter (not PASS) # if variant.filter: # logger.warning( # f"{variant} has filter {','.join(variant.filter)}. Skipped" # ) # num_skipped_variants["has_filter"] += 1 # continue self.input_variants.append(variant) # We also create the result template self.results.append(CharGerResult(variant)) logger.success( f"Read total {len(self.input_variants):,d} variants from the input VCF" ) def _read_pathogenic_variants(self) -> None: """Read known pathogenic variants. Load :attr:`pathogenic_variants` from :attr:`self.config.pathogenic_variant <.CharGerConfig.pathogenic_variant>`. """ if self.config.pathogenic_variant is None: return logger.info(f"Read pathogenic VCF from {self.config.pathogenic_variant}") self.pathogenic_variants = list( Variant.read_and_parse_vcf(self.config.pathogenic_variant) ) logger.info( f"Read total {len(self.pathogenic_variants):,d} pathogenic variants from the VCF" ) def _read_inheritance_gene_table(self) -> None: """Read inheritance gene table for PVS1 module. Load :attr:`inheritance_genes` from :attr:`self.config.inheritance_gene_table <.CharGerConfig.inheritance_gene_table>`. Skip PVS1 module if it's not provided. """ tsv_pth = self.config.inheritance_gene_table # Disable PVS1 module if no list is provided if tsv_pth is None: logger.warning( "Inheritance gene table is not provided, CharGer cannot make ACMG PVS1/PM4 calls " "or CharGer PSC1/PPC1 calls. Disable all these modules" ) for m in ["PVS1", "PM4"]: self._acmg_module_availability[m] = ModuleAvailability.INVALID_SETUP for m in ["PSC1", "PPC1"]: self._charger_module_availability[m] = ModuleAvailability.INVALID_SETUP return else: logger.info( "Disable CharGer PMC1/PPC2 modules when inheritance gene table is provided" ) for m in ["PMC1", "PPC2"]: self._charger_module_availability[m] = ModuleAvailability.INVALID_SETUP if self.config.disease_specific: raise NotImplementedError( "Cannot read disease specific inheritance gene table" ) logger.info(f"Read inheritance gene table from {tsv_pth}") reader = read_tsv(tsv_pth, as_dict=False) header = next(reader) if len(header) < 3: logger.error( f"Expect inheritance gene table to have at least three columns; " f"only got {', '.join(header)}" ) raise ValueError(f"Invalid table format in {tsv_pth}") for (gene, diseases, modes_of_inheritance, *_) in reader: modes = GeneInheritanceMode.parse(modes_of_inheritance) self.inheritance_genes[gene] = modes logger.info( f"Loaded inheritance mode of {len(self.inheritance_genes):,d} genes" ) def _read_pp2_gene_list(self) -> None: """Read gene list for PP2 module. Load :attr:`pp2_genes` from :attr:`self.config.PP2_gene_list <.CharGerConfig.PP2_gene_list>`. Skip PP2 module if not provided. """ gene_list_pth = self.config.PP2_gene_list # Disable PP2 module if no list is provided if gene_list_pth is None: logger.warning( "CharGer cannot make PP2 calls without the given gene list. " "Disable PP2 module" ) self._acmg_module_availability["PP2"] = ModuleAvailability.INVALID_SETUP return logger.info(f"Read PP2 gene list from {gene_list_pth}") self.pp2_genes = set(line.strip() for line in read_lines(gene_list_pth)) logger.info(f"Marked {len(self.pp2_genes):,d} genes for PP2") def _read_bp1_gene_list(self) -> None: """Read gene list for BP1 module. Load :attr:`bp1_genes` from :attr:`self.config.BP1_gene_list <.CharGerConfig.BP1_gene_list>`. Skip BP1 module if not provided. """ gene_list_pth = self.config.BP1_gene_list # Disable BP1 module if no list is provided if gene_list_pth is None: logger.warning( "CharGer cannot make BP1 calls without the given gene list. " "Disable BP1 module" ) self._acmg_module_availability["BP1"] = ModuleAvailability.INVALID_SETUP return logger.info(f"Read BP1 gene list from {gene_list_pth}") self.bp1_genes = set(line.strip() for line in read_lines(gene_list_pth)) logger.info(f"Marked {len(self.bp1_genes):,d} genes for BP1") @staticmethod def _match_clinvar_one_variant( variant: Variant, tabix: TabixFile, cols: List[str] ) -> Optional[Dict[str, Any]]: """Match the variant to the given ClinVar tabix table. Args: variant: Variant to be matched tabix: Tabix indexed CliVar table cols: All ClinVar columns in the table Returns: None if no ClinVar match. When matched, returns a `dict` of the clinvar record, where the key ``final_clinical_significance`` stores the final clinical significance type in :class:`ClinicalSignificance`. """ try: # TabixFile.fetch will raise ValueError if the given region is out of bound row_iter = tabix.fetch( region=f"{variant.chrom}:{variant.start_pos}-{variant.end_pos}" ) except ValueError as e: # Do nothing if it's querying for a chromosome not in the ClinVar table if "could not create iterator for region" not in e.args[0]: logger.opt(exception=e).debug(f"Tabix fetch ClinVar failed: {e}") return None for row in row_iter: record = dict(zip(cols, row.split("\t"))) if ( int(record["start"]) == variant.start_pos and int(record["stop"]) == variant.end_pos and record["alt"] == variant.alt_allele ): if record["ref"] != variant.ref_allele: logger.warning( f"{variant!r} got a clinvar match but their reference alleles are different: " f"{variant.ref_allele!r} != {record['ref']!r}" ) # Parse the clinical significance of the record record[ "final_clinical_significance" ] = ClinicalSignificance.parse_clinvar_record(record) return record return None def _match_clinvar(self) -> None: """Match the input variant with the ClinVar table. Update :attr:`CharGerResult.clinvar` the variant matches a ClinVar record by calling :meth:`_match_clinvar_one_variant`. """ if self.config.clinvar_table is None: logger.info("Skip matching ClinVar") return logger.info( f"Match input variants with ClinVar table at {self.config.clinvar_table}" ) clinvar_match_num = 0 with TabixFile(str(self.config.clinvar_table), encoding="utf8") as tabix: cols = tabix.header[0][len("#") :].split("\t") for result in self.results: record = self._match_clinvar_one_variant(result.variant, tabix, cols) if record is not None: result.clinvar = record clinvar_match_num += 1 logger.success( f"Matched {clinvar_match_num:,d} out of {len(self.input_variants):,d} input variants to a ClinVar record" ) @staticmethod def _run_or_skip_module( module_name: str, module_avail: "ModuleAvailability" ) -> bool: if module_avail is ModuleAvailability.ACTIVE: logger.info("Running {name} module", name=module_name) return True else: logger.info("Skipped {name} module", name=module_name) return False
[docs] def run_acmg_modules(self) -> None: """Run all ACMG modules. See :mod:`~charger.acmg_modules` for all the currently implemented modules. """ logger.info("Run all ACMG modules") def run_or_skip(module_name: str): return self._run_or_skip_module( module_name, self._acmg_module_availability[module_name] ) # PVS1 if run_or_skip("PVS1"): for result in self.results: run_pvs1(result, self.inheritance_genes) # PM4 if run_or_skip("PM4"): for result in self.results: run_pm4(result, self.inheritance_genes)
[docs] def run_charger_modules(self) -> None: """Run all CharGer customized modules. See :mod:`~charger.custom_modules` for all the currently implemented modules. """ logger.info("Run all CharGer modules") def run_or_skip(module_name: str): return self._run_or_skip_module( module_name, self._charger_module_availability[module_name] ) # PSC1 if run_or_skip("PSC1"): for result in self.results: run_psc1(result, self.inheritance_genes) # PMC1 if run_or_skip("PMC1"): for result in self.results: run_pmc1(result, self.inheritance_genes) # PPC1 if run_or_skip("PPC1"): for result in self.results: run_ppc1(result, self.inheritance_genes) # PPC if run_or_skip("PPC2"): for result in self.results: run_ppc2(result, self.inheritance_genes)