from pathlib import Path

from Bio import SeqIO

# File-name endings treated as FASTA input; any other path is skipped.
_FASTA_SUFFIXES = (".fasta", ".fa")


def _full_header_id(record):
    """Return the whole header line (without '>') as the sequence ID."""
    return record.description


def _ligandmpnn_id(record):
    """Return the value of the header's ``id=`` token, else ``record.id``.

    The header is split on whitespace and the first token starting with
    ``id=`` wins. Header fields may be comma-separated (``id=1,``), so a
    trailing comma is stripped from the extracted value.
    """
    for token in record.description.split():
        if token.startswith("id="):
            return token[len("id="):].rstrip(",")
    return record.id


def _collect_sequences(fasta_files, get_sequence_id):
    """Parse every FASTA path and build the ``(sequences, file_info)`` pair.

    ``get_sequence_id`` is called with each parsed record and returns the
    string used as that record's sequence ID.
    """
    sequences = []
    file_info = {}

    for fasta_file in fasta_files:
        print(fasta_file)

        # Normalise once so both ``str`` and ``pathlib.Path`` entries work.
        path_str = str(fasta_file)
        if not path_str.endswith(_FASTA_SUFFIXES):
            continue

        path = Path(path_str)
        file_name = path.stem
        file_seqs = []
        try:
            for record in SeqIO.parse(path_str, "fasta"):
                seq_id = get_sequence_id(record)
                sequences.append((seq_id, str(record.seq), file_name))
                file_seqs.append(seq_id)
            file_info[file_name] = file_seqs
        except Exception as exc:
            # Report the bare file name and keep the original error chained
            # so the underlying cause stays visible in the traceback.
            raise ValueError(f"Error parsing {path.name}: {exc}") from exc

    if not sequences:
        raise ValueError("No sequences found in the provided FASTA files.")

    return sequences, file_info


def parse_fasta_files(fasta_files):
    """Parse one or multiple FASTA files and return sequences.

    This function uses the entire header line as sequence_id to deal with
    LigandMPNN's omission of a unique sequence ID at the beginning of the
    header.

    Paths that do not end in ``.fasta`` or ``.fa`` are skipped. Each path is
    printed to stdout as it is visited.

    Parameters:
    -----------
    fasta_files : list of str or pathlib.Path
        List of paths to FASTA files to be parsed.

    Returns:
    --------
    sequences : list of tuples
        A list of tuples containing (sequence_id, sequence, file_name) for
        each sequence found in the FASTA files. ``file_name`` is the file's
        stem (name without directory or extension).
    file_info : dict
        A dictionary mapping file names (stems) to lists of sequence IDs
        contained in each file. Files sharing the same stem overwrite each
        other's entry.

    Raises:
    -------
    ValueError
        If a file cannot be parsed, or if no sequences were found at all.
    """
    return _collect_sequences(fasta_files, _full_header_id)


def parse_fasta_files_from_ligandmpnn(fasta_files):
    """Parse one or multiple FASTA files and return sequences.

    These files are expected to be in the format generated by LigandMPNN.
    In these FASTA files there is no sequence_id at the start of the header;
    the header holds the name of the file, some info on generated sequence
    quality, and the number of the design as an ``id=0`` style token.
    Hence special parsing is needed to extract the sequence_id from the
    header: the value of the first whitespace-separated ``id=`` token is
    used (with any trailing comma removed). Records without such a token
    fall back to the record's default ID (the first word of the header).

    Paths that do not end in ``.fasta`` or ``.fa`` are skipped. Each path is
    printed to stdout as it is visited.

    Parameters:
    -----------
    fasta_files : list of str or pathlib.Path
        List of paths to FASTA files to be parsed.

    Returns:
    --------
    sequences : list of tuples
        A list of tuples containing (sequence_id, sequence, file_name) for
        each sequence found in the FASTA files. ``file_name`` is the file's
        stem (name without directory or extension).
    file_info : dict
        A dictionary mapping file names (stems) to lists of sequence IDs
        contained in each file. Files sharing the same stem overwrite each
        other's entry.

    Raises:
    -------
    ValueError
        If a file cannot be parsed, or if no sequences were found at all.
    """
    return _collect_sequences(fasta_files, _ligandmpnn_id)