Eqtl analysis using alphagenome

Hi AlphaGenome Team,

I am trying to use Alphagenome for some eqtl analysis task. I wanted to make sure that I am doing this correctly. I am attaching my code snippet below. I would highly appreciate if you could comment on the correctness of the approach.

Many thanks,
Sayan

selected_scorers = scorer_selections = {
    'rna_seq': True,}
variant = genome.Variant(
  chromosome=str(variant_row['CHROM']),
   position=int(variant_row['POS']),
   reference_bases=variant_row['REF'],    
    alternate_bases=variant_row['ALT'],     
    name=variant_row['variant_id'],  )  
    interval = variant.reference_interval.resize(sequence_length)
    variant_scores = dna_model.score_variant(
        interval=interval,
        variant=variant,
        variant_scorers=selected_scorers,
        organism=organism,
    )
    df_scores = variant_scorers.tidy_scores([variant_scores])
    # Extract score for the specific tissue and gene
    result = {
        'variant_id': variant_row['variant_id'],
        'chrom': variant_row['CHROM'],
        'pos': variant_row['POS'],
        'ref': variant_row['REF'],
        'alt': variant_row['ALT'],
    }
    result['tissue'] = variant_row['tissue']
    tissue_key = variant_row['tissue']
    gene_id = variant_row['gene_id_short']
    df = df_scores[
            (df_scores['gene_id'] == gene_id) & 
            (df_scores['gtex_tissue'] == tissue_key)
        ]['raw_score']
     result['alphagenome_score'] = df.values[0].mean()
     result['gene_id'] = variant_row['gene_id']
1 Like

I just realized it might not be clear if don’t provide the full script. So, here it is:

# %%
from io import StringIO
from alphagenome import colab_utils
from alphagenome.data import genome
from alphagenome.models import dna_client, variant_scorers

# %% [markdown]
# 

# %%

import pandas as pd
from tqdm import tqdm
import os

# %%
vcf = pd.read_csv('VCF FILE', sep='\t')

# %%
vcf = vcf.rename(columns={"chr": "CHROM", "pos": "POS", "ref": "REF", "alt": "ALT"})
required_columns = ['variant_id', 'CHROM', 'POS', 'REF', 'ALT']
for column in required_columns:
  if column not in vcf.columns:
    raise ValueError(f'VCF file is missing required column: {column}.')

organism = 'human'  # @param ["human", "mouse"] {type:"string"}


# %%
# @markdown Specify length of sequence around variants to predict:
sequence_length = '1MB'  # @param ["2KB", "16KB", "100KB", "500KB", "1MB"] { type:"string" }
sequence_length = dna_client.SUPPORTED_SEQUENCE_LENGTHS[
    f'SEQUENCE_LENGTH_{sequence_length}'
]

# %%
score_rna_seq = True

# %%
# @markdown Other settings:
download_predictions = False  # @param { type: "boolean" }

# %%

# Parse organism specification.
organism_map = {
    'human': dna_client.Organism.HOMO_SAPIENS,
    'mouse': dna_client.Organism.MUS_MUSCULUS,
}
organism = organism_map[organism]

# %%
# Parse scorer specification.
scorer_selections = {
    'rna_seq': score_rna_seq,
}

# %%
all_scorers = variant_scorers.RECOMMENDED_VARIANT_SCORERS

# %%
selected_scorers = [
    all_scorers[key]
    for key in all_scorers
    if scorer_selections.get(key.lower(), False)
]

# Remove any scorers or output types that are not supported for the chosen organism.
unsupported_scorers = [
    scorer
    for scorer in selected_scorers
    if (
        organism.value
        not in variant_scorers.SUPPORTED_ORGANISMS[scorer.base_variant_scorer]
    )
    | (
        (scorer.requested_output == dna_client.OutputType.PROCAP)
        & (organism == dna_client.Organism.MUS_MUSCULUS)
    )
]
if len(unsupported_scorers) > 0:
  print(
      f'Excluding {unsupported_scorers} scorers as they are not supported for'
      f' {organism}.'
  )
  for unsupported_scorer in unsupported_scorers:
    selected_scorers.remove(unsupported_scorer)



# %%

selected_scorers = [
    all_scorers[key]
    for key in all_scorers
    if scorer_selections.get(key.lower(), False)
]

# Remove any scorers or output types that are not supported for the chosen organism.
unsupported_scorers = [
    scorer
    for scorer in selected_scorers
    if (
        organism.value
        not in variant_scorers.SUPPORTED_ORGANISMS[scorer.base_variant_scorer]
    )
    | (
        (scorer.requested_output == dna_client.OutputType.PROCAP)
        & (organism == dna_client.Organism.MUS_MUSCULUS)
    )
]
if len(unsupported_scorers) > 0:
  print(
      f'Excluding {unsupported_scorers} scorers as they are not supported for'
      f' {organism}.'
  )
  for unsupported_scorer in unsupported_scorers:
    selected_scorers.remove(unsupported_scorer)


# Score variants in the VCF file.
results = []

for i, vcf_row in tqdm(vcf.iterrows(), total=len(vcf)):
  variant = genome.Variant(
      chromosome=str(vcf_row.CHROM),
      position=int(vcf_row.POS),
      reference_bases=vcf_row.REF,
      alternate_bases=vcf_row.ALT,
      name=vcf_row.variant_id,
  )
  interval = variant.reference_interval.resize(sequence_length)

  variant_scores = dna_model.score_variant(
      interval=interval,
      variant=variant,
      variant_scorers=selected_scorers,
      organism=organism,
  )
  df_scores = variant_scorers.tidy_scores([variant_scores])
  tissue_key = vcf_row.tissue
  df = df_scores[(df_scores['gene_id'] == vcf_row.gene_id) & (df_scores['gtex_tissue'] == tissue_key)]['raw_score']
  vcf_row['alphagenome_score'] = df.values.mean() if not df.empty else np.nan
  results.append(vcf_row)
  if i ==10:
    break

Hello Sayan,

Thank you for reaching out. That code looks ok, although this assumes the following: You are selecting scores from the RNA_SEQ variant scorer only and filtering to a tissue and gene of interest.

Thanks @Daniel_Scott for your response! I was using Alphagenome for some eqtl task and that’s why I took that approach.