ESM2 / utils /handle_files.py
gabboud's picture
replace fair-esm model access with huggingface hub, modularize and simplify post-processing
4dcb469
Raw
History Blame Contribute Delete
3.74 kB
from pathlib import Path
from Bio import SeqIO
def parse_fasta_files(fasta_files):
    """Parse one or multiple FASTA files and return their sequences.

    The entire header line (``record.description``) is used as the sequence ID,
    to deal with LigandMPNN's omission of a unique sequence ID at the beginning
    of the header.

    Entries whose path does not end in ``.fasta`` or ``.fa`` (case-sensitive)
    are skipped without error.

    Parameters:
    -----------
    fasta_files : list of str or os.PathLike
        List of paths to FASTA files to be parsed.

    Returns:
    --------
    sequences : list of tuples
        A list of tuples containing (sequence_id, sequence, file_name) for each
        sequence found in the FASTA files. ``file_name`` is the file stem
        (base name without its final extension).
    file_info : dict
        A dictionary mapping file names (stems) to lists of sequence IDs
        contained in each file. Two inputs that share a stem share a key, so
        the later file's list replaces the earlier one.

    Raises:
    -------
    ValueError
        If reading or parsing a file fails (the underlying exception is
        chained as ``__cause__``), or if no sequence was found in any file.
    """
    sequences = []
    file_info = {}

    for fasta_file in fasta_files:
        print(fasta_file)

        # Normalise to a plain string so both str and pathlib.Path entries work
        # for the suffix test and for the error message below.
        path_str = str(fasta_file)
        if not path_str.endswith((".fasta", ".fa")):
            continue

        file_name = Path(path_str).stem
        file_seqs = []
        try:
            # Records are produced while iterating, so the whole loop has to
            # sit inside the try for mid-file parse errors to be caught.
            for record in SeqIO.parse(path_str, "fasta"):
                # Full header line without the leading '>'.
                full_header = record.description
                sequences.append((full_header, str(record.seq), file_name))
                file_seqs.append(full_header)
        except Exception as exc:
            # The path is a string here and has no ``.name`` attribute; report
            # the path itself so the handler cannot fail while building the
            # message.
            raise ValueError(f"Error parsing {path_str}: {exc}") from exc

        file_info[file_name] = file_seqs

    if not sequences:
        raise ValueError("No sequences found in the provided FASTA files.")

    return sequences, file_info
def parse_fasta_files_from_ligandmpnn(fasta_files):
    """Parse one or multiple FASTA files in the format generated by LigandMPNN.

    In these FASTA files the header does not start with a unique sequence ID:
    it holds the name of the file, some information on the quality of the
    generated sequence, and the number of the design (e.g. "id=0"). The
    sequence ID is therefore taken from the first whitespace-separated header
    token that starts with ``id=`` (with that prefix removed). When the header
    has no such token, ``record.id`` is used instead.

    Entries whose path does not end in ``.fasta`` or ``.fa`` (case-sensitive)
    are skipped without error.

    Parameters:
    -----------
    fasta_files : list of str or os.PathLike
        List of paths to FASTA files to be parsed.

    Returns:
    --------
    sequences : list of tuples
        A list of tuples containing (sequence_id, sequence, file_name) for each
        sequence found in the FASTA files. ``file_name`` is the file stem
        (base name without its final extension).
    file_info : dict
        A dictionary mapping file names (stems) to lists of sequence IDs
        contained in each file. Two inputs that share a stem share a key, so
        the later file's list replaces the earlier one.

    Raises:
    -------
    ValueError
        If reading or parsing a file fails (the underlying exception is
        chained as ``__cause__``), or if no sequence was found in any file.
    """
    sequences = []
    file_info = {}

    for fasta_file in fasta_files:
        print(fasta_file)

        # Normalise to a plain string so both str and pathlib.Path entries work
        # for the suffix test and for the error message below.
        path_str = str(fasta_file)
        if not path_str.endswith((".fasta", ".fa")):
            continue

        file_name = Path(path_str).stem
        file_seqs = []
        try:
            # Records are produced while iterating, so the whole loop has to
            # sit inside the try for mid-file parse errors to be caught.
            for record in SeqIO.parse(path_str, "fasta"):
                # First token of the form "id=<value>", else the record's own id.
                # NOTE(review): tokens are split on whitespace only, so any
                # punctuation attached to the token (e.g. a trailing comma)
                # stays in the ID - confirm whether it should be stripped.
                seq_id = next(
                    (
                        token[3:]
                        for token in record.description.split()
                        if token.startswith("id=")
                    ),
                    record.id,
                )
                sequences.append((seq_id, str(record.seq), file_name))
                file_seqs.append(seq_id)
        except Exception as exc:
            # The path is a string here and has no ``.name`` attribute; report
            # the path itself so the handler cannot fail while building the
            # message.
            raise ValueError(f"Error parsing {path_str}: {exc}") from exc

        file_info[file_name] = file_seqs

    if not sequences:
        raise ValueError("No sequences found in the provided FASTA files.")

    return sequences, file_info