Spaces:
Running
Running
| from pathlib import Path | |
| from Bio import SeqIO | |
def parse_fasta_files(fasta_files):
    """Parse one or multiple FASTA files and return sequences.

    The entire header line (without the leading '>') is used as the
    sequence_id, to deal with LigandMPNN's omission of a unique sequence ID
    at the beginning of the header.

    Inputs whose path does not end in '.fasta' or '.fa' (case-sensitive) are
    skipped without an error.

    Parameters:
    -----------
    fasta_files : list of str or os.PathLike
        List of paths to FASTA files to be parsed.

    Returns:
    --------
    sequences : list of tuples
        A list of tuples containing (sequence_id, sequence, file_name) for
        each sequence found in the FASTA files. file_name is the path stem
        (base name without its last extension).
    file_info : dict
        A dictionary mapping file names to lists of sequence IDs contained
        in each file. Two inputs sharing the same stem share one key; the
        later file's list replaces the earlier one.

    Raises:
    -------
    ValueError
        If reading or parsing a file fails (the original exception is
        chained as the cause), or if no sequences were found at all.
    """
    sequences = []
    file_info = {}

    for fasta_file in fasta_files:
        print(fasta_file)

        # Normalise once so both plain strings and Path objects are accepted.
        path_str = str(fasta_file)
        if not path_str.endswith((".fasta", ".fa")):
            continue

        path = Path(path_str)
        file_name = path.stem
        file_seqs = []
        try:
            for record in SeqIO.parse(path_str, "fasta"):
                # Use the entire header as the sequence ID.
                full_header = record.description  # Full header line without '>'
                sequences.append((full_header, str(record.seq), file_name))
                file_seqs.append(full_header)
        except Exception as exc:
            # Build the message from the Path: a plain str has no `.name`,
            # which previously turned every parse failure into an
            # AttributeError raised from inside this handler.
            raise ValueError(f"Error parsing {path.name}: {exc}") from exc
        file_info[file_name] = file_seqs

    if not sequences:
        raise ValueError("No sequences found in the provided FASTA files.")
    return sequences, file_info
def parse_fasta_files_from_ligandmpnn(fasta_files):
    """Parse one or multiple FASTA files in the format generated by LigandMPNN.

    In these FASTA files there is no sequence_id at the start of the header;
    the header holds the name of the file, some info on generated sequence
    quality, and the number of the design (e.g. "id=0"). Hence the
    sequence_id is taken from the first whitespace-separated header token
    that starts with "id=". Headers without such a token fall back to the
    record's own id as reported by the FASTA parser.

    Inputs whose path does not end in '.fasta' or '.fa' (case-sensitive) are
    skipped without an error.

    Parameters:
    -----------
    fasta_files : list of str or os.PathLike
        List of paths to FASTA files to be parsed.

    Returns:
    --------
    sequences : list of tuples
        A list of tuples containing (sequence_id, sequence, file_name) for
        each sequence found in the FASTA files. file_name is the path stem
        (base name without its last extension).
    file_info : dict
        A dictionary mapping file names to lists of sequence IDs contained
        in each file. Two inputs sharing the same stem share one key; the
        later file's list replaces the earlier one.

    Raises:
    -------
    ValueError
        If reading or parsing a file fails (the original exception is
        chained as the cause), or if no sequences were found at all.
    """
    sequences = []
    file_info = {}

    for fasta_file in fasta_files:
        print(fasta_file)

        # Normalise once so both plain strings and Path objects are accepted.
        path_str = str(fasta_file)
        if not path_str.endswith((".fasta", ".fa")):
            continue

        path = Path(path_str)
        file_name = path.stem
        file_seqs = []
        try:
            for record in SeqIO.parse(path_str, "fasta"):
                # First "id=..." token wins; otherwise keep the parser's id.
                # NOTE(review): only the "id=" prefix is removed, so a token
                # such as "id=1," yields "1," -- confirm whether callers
                # expect the trailing punctuation.
                seq_id = next(
                    (
                        token[3:]
                        for token in record.description.split()
                        if token.startswith("id=")
                    ),
                    record.id,
                )
                sequences.append((seq_id, str(record.seq), file_name))
                file_seqs.append(seq_id)
        except Exception as exc:
            # Build the message from the Path: a plain str has no `.name`,
            # which previously turned every parse failure into an
            # AttributeError raised from inside this handler.
            raise ValueError(f"Error parsing {path.name}: {exc}") from exc
        file_info[file_name] = file_seqs

    if not sequences:
        raise ValueError("No sequences found in the provided FASTA files.")
    return sequences, file_info