"""Gradio app that generates ESM2 protein-sequence embeddings from FASTA uploads.

At import time the script caches and loads every model listed in ``MODELS``,
builds the ``demo`` Blocks UI, and wires the upload/submit controls to
``full_embedding_pipeline``. Running the file as a script launches the app.
"""

import gradio as gr
import torch
import numpy as np
from Bio import SeqIO
import tempfile
import os
import json
from pathlib import Path
import zipfile
import spaces

# NOTE(review): wildcard import; cache_all_models / load_all_models are used
# below and no other import in this file provides them, so they are expected
# to come from here.
from utils.download_models import *
from utils.handle_files import parse_fasta_files
from utils.pipelines import generate_embeddings, full_embedding_pipeline

print("Downloading ESM2 models...")

# Maps Hugging Face model id -> display name shown in the model dropdown.
MODELS = {
    "facebook/esm2_t6_8M_UR50D": "ESM2-8M",
    "facebook/esm2_t12_35M_UR50D": "ESM2-35M",
    # Disabled. NOTE(review): the Hugging Face id of the 650M checkpoint is
    # "facebook/esm2_t33_650M_UR50D"; the id below (t36, no "facebook/"
    # prefix) needs correcting before this entry is re-enabled.
    # "esm2_t36_650M_UR50D": "ESM2-650M"
}

cache_dirs = cache_all_models(MODELS)
models_and_tokenizers = load_all_models(MODELS)


def _model_key_for(display_name):
    """Return the ``MODELS`` key (Hugging Face id) for a dropdown display name.

    Raises:
        KeyError: if no entry in ``MODELS`` has ``display_name`` as its value.
    """
    for key, value in MODELS.items():
        if value == display_name:
            return key
    raise KeyError(f"Unknown model name: {display_name!r}")


def _model_and_tokenizer_for(model_key, display_name):
    """Fetch the loaded entry for a model from ``models_and_tokenizers``.

    The original script indexed the mapping by display name when building the
    initial state and by Hugging Face id inside ``pick_model``; at most one of
    those matches how ``load_all_models`` keys its result. Trying the id first
    and then the display name makes both call sites agree whichever keying is
    in use.

    Raises:
        KeyError: if neither ``model_key`` nor ``display_name`` is present.
    """
    try:
        return models_and_tokenizers[model_key]
    except KeyError:
        return models_and_tokenizers[display_name]


# Create Gradio interface
with gr.Blocks(title="ESM2 Protein Embeddings") as demo:
    # NOTE(review): the text below advertises 1280-D embeddings from
    # ESM2-650M, but that model is commented out of MODELS above — confirm
    # what the pipeline actually uses and update the wording accordingly.
    gr.Markdown("""
    # ESM2 Protein Sequence Embeddings

    Generate embeddings for protein sequences using Meta's ESM2 language model.

    **Features:**
    - Process one or multiple FASTA files
    - Generate high-dimensional embeddings (1280-D) using ESM2-650M
    - Download embeddings in NumPy format or as JSON metadata
    - Supports batch processing for efficiency

    **Instructions:**
    1. Upload one or more FASTA files containing protein sequences
    2. Click "Generate Embeddings"
    3. Download the output files (embeddings.npz, metadata.json, summary.txt)

    **Output Files:**
    - `embeddings.npz`: Compressed NumPy file with all embeddings
    - `metadata.json`: JSON file with sequence IDs and metadata
    - `summary.txt`: Human-readable summary
    - `embeddings_[filename].npz`: Per-file embeddings
    """)

    with gr.Row():
        with gr.Column():
            input_files = gr.File(
                label="Upload FASTA files",
                file_count="multiple",
                file_types=[".fasta", ".fa", ".faa"],
            )
            submit_btn = gr.Button("Generate Embeddings", variant="primary", size="lg")

        with gr.Column():
            status_output = gr.Textbox(
                label="Processing Status",
                interactive=False,
                lines=6,
            )

    with gr.Row():
        download_output = gr.File(
            label="Download Output Files",
            file_count="multiple",
        )

    # The first entry of MODELS is the default selection.
    default_model_name = next(iter(MODELS.values()))

    with gr.Row():
        model_dropdown = gr.Dropdown(
            choices=list(MODELS.values()),
            value=default_model_name,
            label="Select Model",
        )

    # Resolve the default model through the same helper pick_model uses, so
    # the initial state and later dropdown changes cannot disagree on keying.
    default_entry = _model_and_tokenizer_for(
        _model_key_for(default_model_name), default_model_name
    )
    # NOTE(review): gr.State deep-copies its initial value; holding the model
    # object itself here may be costly — consider storing the model key.
    model_to_use = gr.State(value=default_entry[0])
    tokenizer_to_use = gr.State(value=default_entry[1])

    def pick_model(model_name):
        """Return the loaded entry for the display name chosen in the dropdown.

        The returned value feeds the ``model_to_use`` and ``tokenizer_to_use``
        states, in that order.
        """
        model_key = _model_key_for(model_name)
        print(f"Selected model: {model_name} ({model_key})")
        return _model_and_tokenizer_for(model_key, model_name)

    model_dropdown.change(
        fn=pick_model,
        inputs=model_dropdown,
        outputs=[model_to_use, tokenizer_to_use],
    )

    # NOTE(review): only the uploaded files are passed here; model_to_use and
    # tokenizer_to_use are not inputs, so the dropdown selection does not
    # reach full_embedding_pipeline through this call — confirm intent.
    submit_btn.click(
        fn=full_embedding_pipeline,
        inputs=[input_files],
        outputs=[download_output, status_output],
    )

    gr.Markdown("""
    ### How to use the embeddings:

    ```python
    import numpy as np
    import json

    # Load embeddings
    embeddings = np.load('embeddings.npz')

    # Access a specific embedding
    embedding = embeddings['file_name_sequence_id']

    # Load metadata
    with open('metadata.json', 'r') as f:
        metadata = json.load(f)
    ```
    """)

if __name__ == "__main__":
    demo.launch()