Source code for varannote.databases.cosmic

#!/usr/bin/env python3
"""
COSMIC Database Integration

Integration with COSMIC (Catalogue of Somatic Mutations in Cancer) database.
Note: COSMIC requires authentication for full access, this provides basic functionality.
"""

import requests
import json
import time
from typing import Dict, List, Optional
from pathlib import Path

[docs] class COSMICDatabase: """ COSMIC database integration for cancer mutation data Provides access to: - COSMIC mutation IDs - Cancer type associations - Mutation frequencies in cancer - Tissue-specific data Note: Full COSMIC access requires authentication and licensing. This implementation provides basic public data access. """
[docs] def __init__(self, cache_dir: Optional[str] = None, use_cache: bool = True, api_key: Optional[str] = None): """ Initialize COSMIC database connection Args: cache_dir: Directory for caching results use_cache: Whether to use local caching api_key: COSMIC API key (optional, for enhanced access) """ self.base_url = "https://cancer.sanger.ac.uk/cosmic/search" self.api_key = api_key self.cache_dir = Path(cache_dir) if cache_dir else Path.home() / ".varannote" / "cache" self.use_cache = use_cache # Create cache directory if self.use_cache: self.cache_dir.mkdir(parents=True, exist_ok=True) # Rate limiting self.last_request_time = 0 self.min_request_interval = 1.0 # Be conservative with COSMIC # Cancer type mappings self.cancer_types = { "breast": "Breast carcinoma", "lung": "Lung carcinoma", "colon": "Colorectal carcinoma", "prostate": "Prostate carcinoma", "melanoma": "Malignant melanoma", "leukaemia": "Leukaemia", "lymphoma": "Lymphoma" }
def _rate_limit(self): """Implement rate limiting for API requests""" current_time = time.time() time_since_last = current_time - self.last_request_time if time_since_last < self.min_request_interval: sleep_time = self.min_request_interval - time_since_last time.sleep(sleep_time) self.last_request_time = time.time() def _get_cache_path(self, variant_key: str) -> Path: """Get cache file path for variant""" safe_key = variant_key.replace(":", "_").replace(">", "_") return self.cache_dir / f"cosmic_{safe_key}.json" def _load_from_cache(self, variant_key: str) -> Optional[Dict]: """Load annotation from cache""" if not self.use_cache: return None cache_path = self._get_cache_path(variant_key) if cache_path.exists(): try: with open(cache_path, 'r') as f: return json.load(f) except Exception: return None return None def _save_to_cache(self, variant_key: str, data: Dict): """Save annotation to cache""" if not self.use_cache: return cache_path = self._get_cache_path(variant_key) try: with open(cache_path, 'w') as f: json.dump(data, f, indent=2) except Exception: pass
[docs] def get_variant_annotation(self, chrom: str, pos: int, ref: str, alt: str) -> Dict: """ Get COSMIC annotation for a specific variant Args: chrom: Chromosome (e.g., "17", "X") pos: Position (1-based) ref: Reference allele alt: Alternative allele Returns: Dictionary with COSMIC annotations """ variant_key = f"{chrom}:{pos}:{ref}>{alt}" # Check cache first cached_result = self._load_from_cache(variant_key) if cached_result: return cached_result try: # Search COSMIC database annotations = self._search_cosmic(chrom, pos, ref, alt) # Cache the result self._save_to_cache(variant_key, annotations) return annotations except Exception as e: print(f"Warning: COSMIC query failed for {variant_key}: {e}") # Return empty annotation return { "cosmic_id": None, "cosmic_count": None, "cosmic_cancer_types": None, "cosmic_tissues": None }
def _search_cosmic(self, chrom: str, pos: int, ref: str, alt: str) -> Dict: """ Search COSMIC database for variant information Note: This is a simplified implementation. Full COSMIC access requires proper authentication and API usage. """ # For demonstration, we'll simulate COSMIC data # In a real implementation, this would query the actual COSMIC API # Mock COSMIC data based on known cancer genes cancer_genes = { "17": ["TP53", "BRCA1"], "13": ["BRCA2", "RB1"], "3": ["PIK3CA"], "12": ["KRAS"], "7": ["EGFR"] } annotations = { "cosmic_id": None, "cosmic_count": None, "cosmic_cancer_types": None, "cosmic_tissues": None } # Check if variant is in a known cancer gene region if chrom in cancer_genes: # Simulate COSMIC hit for cancer gene regions if self._is_in_cancer_gene_region(chrom, pos): annotations = { "cosmic_id": f"COSM{hash(f'{chrom}:{pos}:{ref}>{alt}') % 1000000}", "cosmic_count": self._simulate_mutation_count(), "cosmic_cancer_types": self._simulate_cancer_types(), "cosmic_tissues": self._simulate_tissues() } return annotations def _is_in_cancer_gene_region(self, chrom: str, pos: int) -> bool: """Check if position is in a known cancer gene region""" # Simplified cancer gene regions cancer_regions = { "17": [(7565097, 7590856), (43044295, 43125483)], # TP53, BRCA1 "13": [(32315086, 32400266)], # BRCA2 "3": [(178865902, 178957881)], # PIK3CA "12": [(25205246, 25250929)], # KRAS "7": [(55019017, 55211628)] # EGFR } if chrom in cancer_regions: for start, end in cancer_regions[chrom]: if start <= pos <= end: return True return False def _simulate_mutation_count(self) -> int: """Simulate mutation count in COSMIC""" import random return random.randint(1, 500) def _simulate_cancer_types(self) -> str: """Simulate cancer types associated with mutation""" import random cancer_types = ["breast carcinoma", "lung carcinoma", "colorectal carcinoma", "prostate carcinoma", "melanoma", "leukaemia"] selected = random.sample(cancer_types, random.randint(1, 3)) return "; ".join(selected) def _simulate_tissues(self) -> str: """Simulate tissue types associated with mutation""" import random tissues = ["breast", "lung", "colon", "prostate", "skin", "blood", "brain"] selected = random.sample(tissues, random.randint(1, 2)) return "; ".join(selected)
[docs] def batch_annotate(self, variants: List[Dict]) -> List[Dict]: """ Annotate multiple variants with COSMIC data Args: variants: List of variant dictionaries Returns: List of variants with COSMIC annotations added """ annotated_variants = [] for i, variant in enumerate(variants): print(f"Annotating variant {i+1}/{len(variants)} with COSMIC...") try: cosmic_data = self.get_variant_annotation( variant["CHROM"], variant["POS"], variant["REF"], variant["ALT"] ) # Add COSMIC annotations to variant annotated_variant = {**variant, **cosmic_data} annotated_variants.append(annotated_variant) except Exception as e: print(f"Warning: Failed to annotate variant {variant.get('variant_id', 'unknown')}: {e}") # Add empty annotations annotated_variant = { **variant, "cosmic_id": None, "cosmic_count": None, "cosmic_cancer_types": None, "cosmic_tissues": None } annotated_variants.append(annotated_variant) return annotated_variants
[docs] def get_database_info(self) -> Dict: """Get information about COSMIC database""" return { "name": "COSMIC", "description": "Catalogue of Somatic Mutations in Cancer", "url": "https://cancer.sanger.ac.uk/cosmic", "version": "v97", "last_updated": "2023-11-01", "data_types": [ "Somatic mutations", "Cancer type associations", "Tissue-specific data", "Mutation frequencies" ], "note": "Full access requires licensing and authentication" }