#!/usr/bin/env python3
"""
COSMIC Database Integration
Integration with COSMIC (Catalogue of Somatic Mutations in Cancer) database.
Note: COSMIC requires authentication for full access; this module only provides
basic functionality and currently simulates the annotation data it returns.
"""
import hashlib
import json
import random
import time
from pathlib import Path
from typing import Dict, List, Optional

import requests
class COSMICDatabase:
    """
    COSMIC database integration for cancer mutation data

    Provides access to:
    - COSMIC mutation IDs
    - Cancer type associations
    - Mutation frequencies in cancer
    - Tissue-specific data

    Note: Full COSMIC access requires authentication and licensing.
    This implementation does not query COSMIC: it simulates annotations for
    variants that fall inside a small table of hard-coded cancer gene regions.
    Simulated values are derived from a digest of the variant, so the same
    variant yields the same annotation on every call and in every process.
    """

    # Keys present in every annotation dictionary returned by this class.
    _ANNOTATION_KEYS = (
        "cosmic_id",
        "cosmic_count",
        "cosmic_cancer_types",
        "cosmic_tissues",
    )

    # Simplified cancer gene regions as inclusive (start, end) pairs, keyed by
    # chromosome name without a "chr" prefix.
    # NOTE(review): the coordinates appear to mix reference builds - confirm
    # before relying on them for real data.
    _CANCER_GENE_REGIONS = {
        "17": ((7565097, 7590856), (43044295, 43125483)),  # TP53, BRCA1
        "13": ((32315086, 32400266),),  # BRCA2 (RB1 has no region defined)
        "3": ((178865902, 178957881),),  # PIK3CA
        "12": ((25205246, 25250929),),  # KRAS
        "7": ((55019017, 55211628),),  # EGFR
    }

    # Value pools for the simulated annotations.
    _SIMULATED_CANCER_TYPES = (
        "breast carcinoma",
        "lung carcinoma",
        "colorectal carcinoma",
        "prostate carcinoma",
        "melanoma",
        "leukaemia",
    )
    _SIMULATED_TISSUES = (
        "breast",
        "lung",
        "colon",
        "prostate",
        "skin",
        "blood",
        "brain",
    )

    def __init__(self, cache_dir: Optional[str] = None, use_cache: bool = True, api_key: Optional[str] = None):
        """
        Initialize COSMIC database connection

        Args:
            cache_dir: Directory for caching results; defaults to
                ~/.varannote/cache when not given
            use_cache: Whether to use local caching
            api_key: COSMIC API key (optional, for enhanced access); it is
                stored on the instance but not used by the simulated lookup
        """
        self.base_url = "https://cancer.sanger.ac.uk/cosmic/search"
        self.api_key = api_key
        self.cache_dir = Path(cache_dir) if cache_dir else Path.home() / ".varannote" / "cache"
        self.use_cache = use_cache

        # Create cache directory (only touched when caching is enabled)
        if self.use_cache:
            self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Rate limiting
        self.last_request_time = 0
        self.min_request_interval = 1.0  # Be conservative with COSMIC

        # Cancer type mappings
        self.cancer_types = {
            "breast": "Breast carcinoma",
            "lung": "Lung carcinoma",
            "colon": "Colorectal carcinoma",
            "prostate": "Prostate carcinoma",
            "melanoma": "Malignant melanoma",
            "leukaemia": "Leukaemia",
            "lymphoma": "Lymphoma"
        }

    @classmethod
    def _empty_annotation(cls) -> Dict:
        """Return a fresh annotation dictionary with every field set to None"""
        return dict.fromkeys(cls._ANNOTATION_KEYS)

    @staticmethod
    def _normalize_chrom(chrom) -> str:
        """Return the chromosome name as a string without a leading "chr" prefix"""
        name = str(chrom).strip()
        if name[:3].lower() == "chr":
            name = name[3:]
        return name

    @staticmethod
    def _variant_digest(chrom: str, pos: int, ref: str, alt: str) -> int:
        """
        Return a process-independent integer digest of a variant

        The built-in hash() is salted per interpreter process for strings,
        so it cannot be used to derive identifiers that must be stable.
        """
        key = f"{chrom}:{pos}:{ref}>{alt}".encode("utf-8")
        return int.from_bytes(hashlib.sha256(key).digest()[:8], "big")

    def _rate_limit(self):
        """
        Implement rate limiting for API requests

        Sleeps until at least min_request_interval seconds have passed since
        the previous call, then records the current time. No method of this
        class calls it as written, because the lookup is simulated.
        """
        elapsed = time.time() - self.last_request_time
        remaining = self.min_request_interval - elapsed
        if remaining > 0:
            time.sleep(remaining)
        self.last_request_time = time.time()

    def _get_cache_path(self, variant_key: str) -> Path:
        """
        Get cache file path for variant

        Every character other than letters, digits, ".", "_" and "-" is
        replaced by "_", so symbolic alleles or path separators in the key
        cannot produce an invalid file name or a path outside cache_dir.
        """
        safe_key = "".join(
            ch if (ch.isalnum() or ch in "._-") else "_"
            for ch in variant_key
        )
        return self.cache_dir / f"cosmic_{safe_key}.json"

    def _load_from_cache(self, variant_key: str) -> Optional[Dict]:
        """
        Load annotation from cache

        Returns:
            The cached dictionary, or None when caching is disabled, the
            file is missing or unreadable, or its content is not a JSON
            object
        """
        if not self.use_cache:
            return None
        cache_path = self._get_cache_path(variant_key)
        try:
            with open(cache_path, 'r', encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError):
            # Missing, unreadable or corrupt cache entry: treat as a miss.
            return None
        return data if isinstance(data, dict) else None

    def _save_to_cache(self, variant_key: str, data: Dict):
        """
        Save annotation to cache

        Caching is best effort: a failure to write or serialize the entry
        is ignored so that it never prevents an annotation being returned.
        """
        if not self.use_cache:
            return
        cache_path = self._get_cache_path(variant_key)
        try:
            with open(cache_path, 'w', encoding="utf-8") as f:
                json.dump(data, f, indent=2)
        except (OSError, TypeError, ValueError):
            pass

    def get_variant_annotation(self, chrom: str, pos: int, ref: str, alt: str) -> Dict:
        """
        Get COSMIC annotation for a specific variant

        Args:
            chrom: Chromosome (e.g., "17", "X"); a "chr" prefix is accepted
            pos: Position (1-based)
            ref: Reference allele
            alt: Alternative allele

        Returns:
            Dictionary with COSMIC annotations. If the lookup fails, a
            warning is printed and a dictionary with all fields set to None
            is returned instead of raising.
        """
        variant_key = f"{chrom}:{pos}:{ref}>{alt}"

        # Check cache first (an empty cached object counts as a miss)
        cached_result = self._load_from_cache(variant_key)
        if cached_result:
            return cached_result

        try:
            # Search COSMIC database
            annotations = self._search_cosmic(chrom, pos, ref, alt)
        except Exception as e:
            print(f"Warning: COSMIC query failed for {variant_key}: {e}")
            # Return empty annotation
            return self._empty_annotation()

        # Cache the result
        self._save_to_cache(variant_key, annotations)
        return annotations

    def _search_cosmic(self, chrom: str, pos: int, ref: str, alt: str) -> Dict:
        """
        Search COSMIC database for variant information

        Note: This is a simplified implementation. Full COSMIC access
        requires proper authentication and API usage. For demonstration the
        data is simulated: a variant inside a known cancer gene region gets
        a mock COSMIC hit, any other variant gets an empty annotation.

        Raises:
            TypeError, ValueError: if pos cannot be converted to an integer
        """
        chrom_name = self._normalize_chrom(chrom)
        position = int(pos)

        if not self._is_in_cancer_gene_region(chrom_name, position):
            return self._empty_annotation()

        # Derive the ID and seed the generator from the same stable digest so
        # that a given variant always receives the same simulated annotation.
        digest = self._variant_digest(chrom_name, position, ref, alt)
        rng = random.Random(digest)
        return {
            "cosmic_id": f"COSM{digest % 1000000}",
            "cosmic_count": self._simulate_mutation_count(rng),
            "cosmic_cancer_types": self._simulate_cancer_types(rng),
            "cosmic_tissues": self._simulate_tissues(rng)
        }

    def _is_in_cancer_gene_region(self, chrom: str, pos: int) -> bool:
        """
        Check if position is in a known cancer gene region

        Region bounds are inclusive. The chromosome may carry a "chr" prefix
        and the position may be any value accepted by int().
        """
        position = int(pos)
        regions = self._CANCER_GENE_REGIONS.get(self._normalize_chrom(chrom), ())
        return any(start <= position <= end for start, end in regions)

    def _simulate_mutation_count(self, rng: Optional[random.Random] = None) -> int:
        """
        Simulate mutation count in COSMIC

        Args:
            rng: Random generator to draw from; the module-level generator
                is used when omitted

        Returns:
            An integer between 1 and 500 inclusive
        """
        source = random if rng is None else rng
        return source.randint(1, 500)

    def _simulate_cancer_types(self, rng: Optional[random.Random] = None) -> str:
        """
        Simulate cancer types associated with mutation

        Args:
            rng: Random generator to draw from; the module-level generator
                is used when omitted

        Returns:
            One to three distinct cancer type names joined by "; "
        """
        source = random if rng is None else rng
        selected = source.sample(self._SIMULATED_CANCER_TYPES, source.randint(1, 3))
        return "; ".join(selected)

    def _simulate_tissues(self, rng: Optional[random.Random] = None) -> str:
        """
        Simulate tissue types associated with mutation

        Args:
            rng: Random generator to draw from; the module-level generator
                is used when omitted

        Returns:
            One or two distinct tissue names joined by "; "
        """
        source = random if rng is None else rng
        selected = source.sample(self._SIMULATED_TISSUES, source.randint(1, 2))
        return "; ".join(selected)

    def batch_annotate(self, variants: List[Dict]) -> List[Dict]:
        """
        Annotate multiple variants with COSMIC data

        Args:
            variants: List of variant dictionaries; each is read through its
                "CHROM", "POS", "REF" and "ALT" keys

        Returns:
            List of variants with COSMIC annotations added. A variant that
            cannot be annotated (for example one missing a required key) is
            kept and given annotations set to None.
        """
        total = len(variants)
        annotated_variants = []
        for index, variant in enumerate(variants, start=1):
            print(f"Annotating variant {index}/{total} with COSMIC...")
            try:
                cosmic_data = self.get_variant_annotation(
                    variant["CHROM"],
                    variant["POS"],
                    variant["REF"],
                    variant["ALT"]
                )
            except Exception as e:
                print(f"Warning: Failed to annotate variant {variant.get('variant_id', 'unknown')}: {e}")
                # Add empty annotations
                cosmic_data = self._empty_annotation()
            # Add COSMIC annotations to variant
            annotated_variants.append({**variant, **cosmic_data})
        return annotated_variants

    def get_database_info(self) -> Dict:
        """Get information about COSMIC database"""
        return {
            "name": "COSMIC",
            "description": "Catalogue of Somatic Mutations in Cancer",
            "url": "https://cancer.sanger.ac.uk/cosmic",
            "version": "v97",
            "last_updated": "2023-11-01",
            "data_types": [
                "Somatic mutations",
                "Cancer type associations",
                "Tissue-specific data",
                "Mutation frequencies"
            ],
            "note": "Full access requires licensing and authentication"
        }