Source code for varannote.databases.dbsnp

#!/usr/bin/env python3
"""
dbSNP Database Integration

Integration with NCBI dbSNP database for variant identifiers and basic information.
"""

import requests
import json
import time
from typing import Dict, List, Optional
from pathlib import Path

[docs] class DbSNPDatabase: """ dbSNP database integration for variant identifiers Provides access to: - rsID identifiers - Variant validation status - Allele frequencies (when available) - Clinical significance flags """
[docs] def __init__(self, cache_dir: Optional[str] = None, use_cache: bool = True): """ Initialize dbSNP database connection Args: cache_dir: Directory for caching results use_cache: Whether to use local caching """ self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils" self.cache_dir = Path(cache_dir) if cache_dir else Path.home() / ".varannote" / "cache" self.use_cache = use_cache # Create cache directory if self.use_cache: self.cache_dir.mkdir(parents=True, exist_ok=True) # Rate limiting self.last_request_time = 0 self.min_request_interval = 0.34 # NCBI allows ~3 requests per second
def _rate_limit(self): """Implement rate limiting for API requests""" current_time = time.time() time_since_last = current_time - self.last_request_time if time_since_last < self.min_request_interval: sleep_time = self.min_request_interval - time_since_last time.sleep(sleep_time) self.last_request_time = time.time() def _get_cache_path(self, variant_key: str) -> Path: """Get cache file path for variant""" safe_key = variant_key.replace(":", "_").replace(">", "_") return self.cache_dir / f"dbsnp_{safe_key}.json" def _load_from_cache(self, variant_key: str) -> Optional[Dict]: """Load annotation from cache""" if not self.use_cache: return None cache_path = self._get_cache_path(variant_key) if cache_path.exists(): try: with open(cache_path, 'r') as f: return json.load(f) except Exception: return None return None def _save_to_cache(self, variant_key: str, data: Dict): """Save annotation to cache""" if not self.use_cache: return cache_path = self._get_cache_path(variant_key) try: with open(cache_path, 'w') as f: json.dump(data, f, indent=2) except Exception: pass
[docs] def get_variant_annotation(self, chrom: str, pos: int, ref: str, alt: str) -> Dict: """ Get dbSNP annotation for a specific variant Args: chrom: Chromosome (e.g., "17", "X") pos: Position (1-based) ref: Reference allele alt: Alternative allele Returns: Dictionary with dbSNP annotations """ variant_key = f"{chrom}:{pos}:{ref}>{alt}" # Check cache first cached_result = self._load_from_cache(variant_key) if cached_result: return cached_result try: # Search dbSNP by coordinates annotations = self._search_by_coordinates(chrom, pos, ref, alt) # Cache the result self._save_to_cache(variant_key, annotations) return annotations except Exception as e: print(f"Warning: dbSNP query failed for {variant_key}: {e}") # Return empty annotation return { "dbsnp_id": None, "dbsnp_validated": None, "dbsnp_maf": None }
def _search_by_coordinates(self, chrom: str, pos: int, ref: str, alt: str) -> Dict: """Search dbSNP by genomic coordinates""" self._rate_limit() # Construct search term search_term = f"chr{chrom}:{pos}[chrpos]" # Search using ESearch esearch_url = f"{self.base_url}/esearch.fcgi" esearch_params = { "db": "snp", "term": search_term, "retmode": "json", "retmax": "10" } response = requests.get(esearch_url, params=esearch_params, timeout=30) response.raise_for_status() search_data = response.json() if not search_data.get("esearchresult", {}).get("idlist"): return { "dbsnp_id": None, "dbsnp_validated": None, "dbsnp_maf": None } # Get detailed information using ESummary ids = search_data["esearchresult"]["idlist"][:5] return self._get_snp_details(ids, ref, alt) def _get_snp_details(self, snp_ids: List[str], ref: str, alt: str) -> Dict: """Get detailed SNP information from dbSNP IDs""" self._rate_limit() # Use ESummary to get detailed information esummary_url = f"{self.base_url}/esummary.fcgi" esummary_params = { "db": "snp", "id": ",".join(snp_ids), "retmode": "json" } response = requests.get(esummary_url, params=esummary_params, timeout=30) response.raise_for_status() summary_data = response.json() if "result" not in summary_data: return { "dbsnp_id": None, "dbsnp_validated": None, "dbsnp_maf": None } # Process results to find matching alleles for snp_id in snp_ids: if snp_id in summary_data["result"]: snp_data = summary_data["result"][snp_id] # Check if alleles match if self._alleles_match(snp_data, ref, alt): return { "dbsnp_id": f"rs{snp_data.get('rsid', snp_id)}", "dbsnp_validated": snp_data.get("validation", "").lower() == "true", "dbsnp_maf": self._extract_maf(snp_data) } # If no exact match found, return the first result if snp_ids and snp_ids[0] in summary_data["result"]: snp_data = summary_data["result"][snp_ids[0]] return { "dbsnp_id": f"rs{snp_data.get('rsid', snp_ids[0])}", "dbsnp_validated": snp_data.get("validation", "").lower() == "true", "dbsnp_maf": self._extract_maf(snp_data) } return { "dbsnp_id": None, "dbsnp_validated": None, "dbsnp_maf": None } def _alleles_match(self, snp_data: Dict, ref: str, alt: str) -> bool: """Check if SNP alleles match the query variant""" # Simple allele matching - could be improved alleles = snp_data.get("alleles", "") return ref in alleles and alt in alleles def _extract_maf(self, snp_data: Dict) -> Optional[float]: """Extract minor allele frequency from SNP data""" # This is simplified - real implementation would parse frequency data maf_str = snp_data.get("maf", "") if maf_str: try: return float(maf_str) except ValueError: return None return None
[docs] def batch_annotate(self, variants: List[Dict]) -> List[Dict]: """ Annotate multiple variants with dbSNP data Args: variants: List of variant dictionaries Returns: List of variants with dbSNP annotations added """ annotated_variants = [] for i, variant in enumerate(variants): print(f"Annotating variant {i+1}/{len(variants)} with dbSNP...") try: dbsnp_data = self.get_variant_annotation( variant["CHROM"], variant["POS"], variant["REF"], variant["ALT"] ) # Add dbSNP annotations to variant annotated_variant = {**variant, **dbsnp_data} annotated_variants.append(annotated_variant) except Exception as e: print(f"Warning: Failed to annotate variant {variant.get('variant_id', 'unknown')}: {e}") # Add empty annotations annotated_variant = { **variant, "dbsnp_id": None, "dbsnp_validated": None, "dbsnp_maf": None } annotated_variants.append(annotated_variant) return annotated_variants
[docs] def get_database_info(self) -> Dict: """Get information about dbSNP database""" return { "name": "dbSNP", "description": "NCBI database of single nucleotide polymorphisms", "url": "https://www.ncbi.nlm.nih.gov/snp/", "api_url": self.base_url, "version": "Build 155", "last_updated": "2023-05-09", "data_types": [ "rsID identifiers", "Validation status", "Minor allele frequencies", "Population data" ] }