# Source code for varannote.databases.clinvar

#!/usr/bin/env python3
"""
ClinVar Database Integration

Real-time integration with NCBI ClinVar database for clinical significance annotations.
Queries the NCBI E-utilities REST API and caches per-variant results locally as JSON.
"""

import requests
import json
import time
from typing import Dict, List, Optional, Tuple
from pathlib import Path
import pandas as pd
from urllib.parse import quote

class ClinVarDatabase:
    """
    ClinVar database integration for clinical variant significance.

    Variants are looked up through the NCBI E-utilities endpoints (ESearch to
    find ClinVar record IDs, ESummary to fetch their details). Each per-variant
    result can optionally be cached on disk as a small JSON file.

    Every annotation dictionary produced by this class has the keys:
    - ``clinvar_significance``: normalised clinical significance
    - ``clinvar_id``: ClinVar accession
    - ``clinvar_review_status``: review status text
    - ``clinvar_conditions``: ``"; "``-joined condition names, or None
    - ``clinvar_last_evaluated``: last evaluation date, or None
    """

    # RefSeq accessions of the GRCh38 chromosome sequences, used to build
    # genomic HGVS expressions. The accession is "NC_" plus six digits, and the
    # version suffix differs per chromosome, so it cannot be derived from the
    # chromosome name with string formatting. The mitochondrial genome is left
    # out on purpose: its HGVS expressions use the "m." prefix, not "g.".
    _GRCH38_ACCESSIONS = {
        "1": "NC_000001.11",
        "2": "NC_000002.12",
        "3": "NC_000003.12",
        "4": "NC_000004.12",
        "5": "NC_000005.10",
        "6": "NC_000006.12",
        "7": "NC_000007.14",
        "8": "NC_000008.11",
        "9": "NC_000009.12",
        "10": "NC_000010.11",
        "11": "NC_000011.10",
        "12": "NC_000012.12",
        "13": "NC_000013.11",
        "14": "NC_000014.9",
        "15": "NC_000015.10",
        "16": "NC_000016.10",
        "17": "NC_000017.11",
        "18": "NC_000018.10",
        "19": "NC_000019.10",
        "20": "NC_000020.11",
        "21": "NC_000021.9",
        "22": "NC_000022.11",
        "X": "NC_000023.11",
        "Y": "NC_000024.10",
    }

    def __init__(self, cache_dir: Optional[str] = None, use_cache: bool = True):
        """
        Initialize ClinVar database connection

        Args:
            cache_dir: Directory for caching results; defaults to
                ``~/.varannote/cache`` when not given
            use_cache: Whether to use local caching
        """
        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
        self.clinvar_api = "https://www.ncbi.nlm.nih.gov/clinvar/api/v2"
        self.cache_dir = Path(cache_dir) if cache_dir else Path.home() / ".varannote" / "cache"
        self.use_cache = use_cache

        # Create cache directory
        if self.use_cache:
            self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Rate limiting
        self.last_request_time = 0
        self.min_request_interval = 0.34  # NCBI allows ~3 requests per second

        # Set by the search helpers when a request or parse step raised, so
        # that a failed lookup is not cached as "variant has no annotation".
        self._lookup_failed = False

        # Clinical significance mapping
        self.significance_mapping = {
            "Pathogenic": "Pathogenic",
            "Likely pathogenic": "Likely_pathogenic",
            "Uncertain significance": "Uncertain_significance",
            "Likely benign": "Likely_benign",
            "Benign": "Benign",
            "Pathogenic/Likely pathogenic": "Pathogenic",
            "Benign/Likely benign": "Benign",
            "Conflicting interpretations of pathogenicity": "Conflicting",
            "not provided": "Not_provided",
        }

    @staticmethod
    def _empty_annotation() -> Dict:
        """Return the annotation used when nothing was found (all fields None)."""
        return {
            "clinvar_significance": None,
            "clinvar_id": None,
            "clinvar_review_status": None,
            "clinvar_conditions": None,
            "clinvar_last_evaluated": None,
        }

    @staticmethod
    def _normalize_chrom(chrom) -> str:
        """Return the chromosome name without surrounding space or a 'chr' prefix."""
        name = str(chrom).strip()
        if name[:3].lower() == "chr":
            name = name[3:]
        return name

    @classmethod
    def _build_hgvs_g(cls, chrom, pos, ref: str, alt: str) -> Optional[str]:
        """
        Build a GRCh38 genomic HGVS expression for a single-nucleotide variant.

        Returns:
            e.g. ``"NC_000017.11:g.43044295G>A"``, or None when the variant is
            not an SNV or the chromosome has no entry in the accession table.
        """
        if len(ref) != 1 or len(alt) != 1:
            return None
        # Upper-case and drop leading zeros so "x" and "01" still resolve.
        key = cls._normalize_chrom(chrom).upper().lstrip("0")
        accession = cls._GRCH38_ACCESSIONS.get(key)
        if accession is None:
            return None
        return f"{accession}:g.{pos}{ref}>{alt}"

    def _rate_limit(self):
        """Implement rate limiting for API requests"""
        elapsed = time.time() - self.last_request_time
        if elapsed < self.min_request_interval:
            time.sleep(self.min_request_interval - elapsed)
        self.last_request_time = time.time()

    def _get_cache_path(self, variant_key: str) -> Path:
        """Get cache file path for variant"""
        # Map every character outside [alphanumeric . _ -] to "_" so that
        # symbolic alleles or stray path separators can never produce a path
        # outside the cache directory.
        safe_key = "".join(
            ch if (ch.isalnum() or ch in "._-") else "_" for ch in variant_key
        )
        return self.cache_dir / f"clinvar_{safe_key}.json"

    def _load_from_cache(self, variant_key: str) -> Optional[Dict]:
        """Load annotation from cache; None when disabled, missing or unreadable."""
        if not self.use_cache:
            return None

        cache_path = self._get_cache_path(variant_key)
        try:
            with open(cache_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError):
            # Missing file, unreadable file or corrupt JSON: treat as a miss.
            return None
        return data if isinstance(data, dict) else None

    def _save_to_cache(self, variant_key: str, data: Dict):
        """Save annotation to cache (best effort; failures only emit a warning)."""
        if not self.use_cache:
            return

        cache_path = self._get_cache_path(variant_key)
        try:
            with open(cache_path, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2)
        except (OSError, TypeError, ValueError) as e:
            # Caching is an optimisation; never let it break annotation.
            print(f"Warning: could not write ClinVar cache file {cache_path}: {e}")

    def get_variant_annotation(self, chrom: str, pos: int, ref: str, alt: str) -> Dict:
        """
        Get ClinVar annotation for a specific variant

        Args:
            chrom: Chromosome (e.g., "17", "X")
            pos: Position (1-based)
            ref: Reference allele
            alt: Alternative allele

        Returns:
            Dictionary with ClinVar annotations. When nothing is found, every
            field is None.
        """
        variant_key = f"{chrom}:{pos}:{ref}>{alt}"

        # Check cache first
        cached_result = self._load_from_cache(variant_key)
        if cached_result:
            return cached_result

        self._lookup_failed = False
        annotations = {}

        # Strategy 1: genomic coordinates. Strategy 2: HGVS notation.
        # Later strategies overwrite fields found by earlier ones.
        for strategy in (self._search_by_coordinates, self._search_by_hgvs):
            found = strategy(chrom, pos, ref, alt)
            if found:
                annotations.update(found)

        found_any = bool(annotations)
        if not found_any:
            annotations = self._empty_annotation()

        # Do not cache an empty result that may only be empty because a
        # request failed; otherwise one network hiccup would hide the variant's
        # annotation until the cache file is deleted.
        if found_any or not self._lookup_failed:
            self._save_to_cache(variant_key, annotations)

        return annotations

    def _esearch(self, term: str, retmax: str) -> List[str]:
        """Run an ESearch query against the clinvar db and return the ID list."""
        self._rate_limit()
        response = requests.get(
            f"{self.base_url}/esearch.fcgi",
            params={
                "db": "clinvar",
                "term": term,
                "retmode": "json",
                "retmax": retmax,
            },
            timeout=30,
        )
        response.raise_for_status()
        return response.json().get("esearchresult", {}).get("idlist") or []

    def _search_by_coordinates(self, chrom: str, pos: int, ref: str, alt: str) -> Optional[Dict]:
        """Search ClinVar by genomic coordinates; None when nothing is found or on error."""
        try:
            # Format: chr17:43044295[chrpos] AND G>A[variant]
            # NOTE(review): query syntax kept from the original implementation;
            # confirm the field tags against the ClinVar search-field help.
            name = self._normalize_chrom(chrom)  # avoids "chrchr17"
            search_term = f"chr{name}:{pos}[chrpos] AND {ref}>{alt}[variant]"

            ids = self._esearch(search_term, retmax="10")
            if not ids:
                return None

            # Limit to first 5 results
            return self._get_variant_details(ids[:5])
        except Exception as e:
            # Deliberately broad: annotation is best effort and must not abort
            # the caller because of a network or response-format problem.
            self._lookup_failed = True
            print(f"Warning: ClinVar coordinate search failed: {e}")
            return None

    def _search_by_hgvs(self, chrom: str, pos: int, ref: str, alt: str) -> Optional[Dict]:
        """Search ClinVar by HGVS notation (SNVs only); None when not applicable or on error."""
        try:
            hgvs_g = self._build_hgvs_g(chrom, pos, ref, alt)
            if hgvs_g is None:
                return None

            ids = self._esearch(f'"{hgvs_g}"[variant name]', retmax="5")
            if not ids:
                return None

            return self._get_variant_details(ids)
        except Exception as e:
            # Deliberately broad, see _search_by_coordinates.
            self._lookup_failed = True
            print(f"Warning: ClinVar HGVS search failed: {e}")
            return None

    def _parse_summary_record(self, record: Dict, clinvar_id: str) -> Dict:
        """
        Convert one ESummary record into an annotation dictionary.

        Accepts the classification either as a plain string or as a nested
        object carrying ``description`` / ``review_status`` / ``last_evaluated``.
        NOTE(review): the nested field names (``germline_classification``,
        ``trait_set``, ``trait_name``) are assumptions about the ESummary JSON;
        confirm against a live response.
        """
        classification = record.get("clinical_significance")
        if not classification:
            classification = record.get("germline_classification") or ""

        if isinstance(classification, dict):
            significance = classification.get("description") or ""
            review_status = classification.get("review_status") or record.get("review_status", "")
            last_evaluated = classification.get("last_evaluated") or record.get("last_evaluated")
            conditions = (
                record.get("condition_list")
                or classification.get("trait_set")
                or record.get("trait_set")
                or []
            )
        else:
            significance = classification
            review_status = record.get("review_status", "")
            last_evaluated = record.get("last_evaluated", None)
            conditions = record.get("condition_list", [])

        # Only strings are hashable mapping keys we know about; anything else
        # is passed through untouched instead of raising TypeError.
        if isinstance(significance, str):
            significance = self.significance_mapping.get(significance, significance)

        condition_names = []
        if isinstance(conditions, list):
            for condition in conditions:
                if isinstance(condition, dict):
                    name = condition.get("name") or condition.get("trait_name")
                    if name:
                        condition_names.append(name)

        # Avoid "VCVVCV..." when the accession already carries its prefix.
        accession = str(record.get("accession") or clinvar_id)
        if not accession.upper().startswith("VCV"):
            accession = f"VCV{accession}"

        return {
            "clinvar_significance": significance,
            "clinvar_id": accession,
            "clinvar_review_status": review_status,
            "clinvar_conditions": "; ".join(condition_names) if condition_names else None,
            "clinvar_last_evaluated": last_evaluated,
        }

    def _get_variant_details(self, clinvar_ids: List[str]) -> Optional[Dict]:
        """Get detailed variant information from ClinVar IDs (first usable record wins)."""
        if not clinvar_ids:
            return None

        try:
            self._rate_limit()

            # Use ESummary to get detailed information
            response = requests.get(
                f"{self.base_url}/esummary.fcgi",
                params={
                    "db": "clinvar",
                    "id": ",".join(clinvar_ids),
                    "retmode": "json",
                },
                timeout=30,
            )
            response.raise_for_status()

            result = response.json().get("result")
            if not isinstance(result, dict):
                return None

            # Process the first valid result
            for clinvar_id in clinvar_ids:
                record = result.get(clinvar_id)
                # presumably records that could not be resolved carry an
                # "error" entry; skip those rather than annotate from them
                if isinstance(record, dict) and "error" not in record:
                    return self._parse_summary_record(record, clinvar_id)

            return None
        except Exception as e:
            # Deliberately broad, see _search_by_coordinates.
            self._lookup_failed = True
            print(f"Warning: ClinVar details retrieval failed: {e}")
            return None

    def batch_annotate(self, variants: List[Dict]) -> List[Dict]:
        """
        Annotate multiple variants with ClinVar data

        Args:
            variants: List of variant dictionaries with the keys
                "CHROM", "POS", "REF" and "ALT"

        Returns:
            List of variants with ClinVar annotations added. A variant whose
            annotation fails gets all ClinVar fields set to None.
        """
        total = len(variants)
        annotated_variants = []

        for index, variant in enumerate(variants, start=1):
            print(f"Annotating variant {index}/{total} with ClinVar...")

            try:
                clinvar_data = self.get_variant_annotation(
                    variant["CHROM"],
                    variant["POS"],
                    variant["REF"],
                    variant["ALT"],
                )
            except Exception as e:
                print(f"Warning: Failed to annotate variant {variant.get('variant_id', 'unknown')}: {e}")
                # Add empty annotations
                clinvar_data = self._empty_annotation()

            # Add ClinVar annotations to variant
            annotated_variants.append({**variant, **clinvar_data})

        return annotated_variants

    def get_database_info(self) -> Dict:
        """Get information about ClinVar database"""
        return {
            "name": "ClinVar",
            "description": "NCBI database of genomic variation and human health",
            "url": "https://www.ncbi.nlm.nih.gov/clinvar/",
            "api_url": self.clinvar_api,
            "version": "Current",
            "last_updated": "Real-time",
            "data_types": [
                "Clinical significance",
                "Review status",
                "Condition information",
                "Submission details",
            ],
        }