Spaces:

hugging-science
/

ESM2

Running

App Files Files Community

ESM2 / utils /handle_files.py

gabboud

replace fair-esm model access with huggingface hub, modularize and simplify post-processing

4dcb469 4 months ago

Raw

History Blame Contribute Delete

3.74 kB

	from pathlib import Path
	from Bio import SeqIO


	def parse_fasta_files(fasta_files):
	    """Parse one or more FASTA files and return their sequences.

	    The entire header line (without the leading '>') is used as the
	    sequence ID, to cope with LigandMPNN's omission of a unique sequence ID
	    at the beginning of the header.

	    Only paths ending in '.fasta' or '.fa' are parsed; any other path is
	    skipped. Every path is printed to stdout as it is visited.

	    Parameters:
	    -----------
	    fasta_files : list of str or path-like
	        Paths to the FASTA files to be parsed.

	    Returns:
	    --------
	    sequences : list of tuples
	        (sequence_id, sequence, file_name) for each record found, where
	        file_name is the file's stem (base name without its extension).
	    file_info : dict
	        Maps each file stem to the list of sequence IDs read from that
	        file. Files sharing the same stem overwrite each other's entry.

	    Raises:
	    -------
	    ValueError
	        If a file cannot be parsed, or if no sequences were found in any
	        of the provided files.
	    """
	    sequences = []
	    file_info = {}

	    for fasta_file in fasta_files:
	        print(fasta_file)

	        # Normalise to str so plain strings and Path objects both work.
	        path_str = str(fasta_file)
	        if not path_str.endswith(('.fasta', '.fa')):
	            continue

	        file_name = Path(path_str).stem
	        file_seqs = []

	        try:
	            for record in SeqIO.parse(path_str, "fasta"):
	                # record.description is the full header line without '>'.
	                full_header = record.description
	                sequences.append((full_header, str(record.seq), file_name))
	                file_seqs.append(full_header)
	        except Exception as e:
	            # The path is a string, not a file object, so it has no `.name`
	            # attribute; derive the base name through Path instead so the
	            # intended ValueError is what actually reaches the caller.
	            raise ValueError(
	                f"Error parsing {Path(path_str).name}: {e}"
	            ) from e

	        file_info[file_name] = file_seqs

	    if not sequences:
	        raise ValueError("No sequences found in the provided FASTA files.")

	    return sequences, file_info

	def parse_fasta_files_from_ligandmpnn(fasta_files):
	    """Parse one or more FASTA files written in LigandMPNN's output format.

	    In these files the header carries no dedicated sequence ID: it holds
	    the name of the file, some information on the quality of the generated
	    sequence, and the design number as an "id=<n>" field. The sequence ID
	    is therefore taken from the first whitespace-separated header token
	    that starts with "id="; when no such token exists, the record's own ID
	    (record.id) is used instead.

	    Only paths ending in '.fasta' or '.fa' are parsed; any other path is
	    skipped. Every path is printed to stdout as it is visited.

	    Parameters:
	    -----------
	    fasta_files : list of str or path-like
	        Paths to the FASTA files to be parsed.

	    Returns:
	    --------
	    sequences : list of tuples
	        (sequence_id, sequence, file_name) for each record found, where
	        file_name is the file's stem (base name without its extension).
	    file_info : dict
	        Maps each file stem to the list of sequence IDs read from that
	        file. Files sharing the same stem overwrite each other's entry.

	    Raises:
	    -------
	    ValueError
	        If a file cannot be parsed, or if no sequences were found in any
	        of the provided files.
	    """
	    sequences = []
	    file_info = {}

	    for fasta_file in fasta_files:
	        print(fasta_file)

	        # Normalise to str so plain strings and Path objects both work.
	        path_str = str(fasta_file)
	        if not path_str.endswith(('.fasta', '.fa')):
	            continue

	        file_name = Path(path_str).stem
	        file_seqs = []

	        try:
	            for record in SeqIO.parse(path_str, "fasta"):
	                id_token = next(
	                    (
	                        token
	                        for token in record.description.split()
	                        if token.startswith("id=")
	                    ),
	                    None,
	                )
	                if id_token is None:
	                    seq_id = record.id
	                else:
	                    # Drop the "id=" prefix. Header fields appear to be
	                    # comma-separated (e.g. "id=1,"), so a trailing comma
	                    # is removed as well; this is a no-op otherwise.
	                    # NOTE(review): confirm against real LigandMPNN output.
	                    seq_id = id_token[3:].rstrip(",")

	                sequences.append((seq_id, str(record.seq), file_name))
	                file_seqs.append(seq_id)
	        except Exception as e:
	            # The path is a string, not a file object, so it has no `.name`
	            # attribute; derive the base name through Path instead so the
	            # intended ValueError is what actually reaches the caller.
	            raise ValueError(
	                f"Error parsing {Path(path_str).name}: {e}"
	            ) from e

	        file_info[file_name] = file_seqs

	    if not sequences:
	        raise ValueError("No sequences found in the provided FASTA files.")

	    return sequences, file_info