from pathlib import Path
from Bio import SeqIO


def parse_fasta_files(fasta_files):
    """Parse one or multiple FASTA files and return sequences.

    The entire header line is used as the sequence ID, because LigandMPNN
    does not put a unique sequence ID at the beginning of the header.

    Parameters:
    -----------
    fasta_files : list of str or path-like
        Paths to the FASTA files to be parsed. Only paths ending in
        '.fasta' or '.fa' (case-sensitive) are read; other entries are
        skipped without raising.

    Returns:
    --------
    sequences : list of tuples
        A list of tuples containing (sequence_id, sequence, file_name) for
        each sequence found in the FASTA files, where file_name is the file
        stem (name without directory and extension).
    file_info : dict
        A dictionary mapping file names (stems) to lists of sequence IDs
        contained in each file. Two input files sharing the same stem
        overwrite each other's entry.

    Raises:
    -------
    ValueError
        If a file cannot be read or parsed, or if no sequences were found in
        any of the provided files.
    """
    sequences = []
    file_info = {}

    for fasta_file in fasta_files:
        print(fasta_file)
        # str() lets pathlib.Path inputs go through the same suffix check.
        if not str(fasta_file).endswith(('.fasta', '.fa')):
            continue

        path = Path(fasta_file)
        file_name = path.stem
        file_seqs = []

        try:
            for record in SeqIO.parse(fasta_file, "fasta"):
                # Use the entire header as the sequence ID
                full_header = record.description  # Full header line without '>'
                sequences.append((full_header, str(record.seq), file_name))
                file_seqs.append(full_header)
        except Exception as e:
            # The input is usually a plain string, which has no `.name`;
            # derive the file name through Path so the handler itself cannot
            # fail, and chain the original error for debugging.
            raise ValueError(f"Error parsing {path.name}: {e}") from e

        file_info[file_name] = file_seqs

    if not sequences:
        raise ValueError("No sequences found in the provided FASTA files.")

    return sequences, file_info

def parse_fasta_files_from_ligandmpnn(fasta_files):
    """Parse one or multiple FASTA files in the format generated by LigandMPNN.

    LigandMPNN headers carry no leading sequence ID: they consist of the
    input file name, some quality information on the generated sequence and
    the design number as an "id=<n>" field. The sequence ID is therefore
    taken from the "id=" field when present, and falls back to the first
    whitespace-delimited token of the header (Biopython's ``record.id``)
    otherwise.

    Parameters:
    -----------
    fasta_files : list of str or path-like
        Paths to the FASTA files to be parsed. Only paths ending in
        '.fasta' or '.fa' (case-sensitive) are read; other entries are
        skipped without raising.

    Returns:
    --------
    sequences : list of tuples
        A list of tuples containing (sequence_id, sequence, file_name) for
        each sequence found in the FASTA files, where file_name is the file
        stem (name without directory and extension).
    file_info : dict
        A dictionary mapping file names (stems) to lists of sequence IDs
        contained in each file. Two input files sharing the same stem
        overwrite each other's entry.

    Raises:
    -------
    ValueError
        If a file cannot be read or parsed, or if no sequences were found in
        any of the provided files.
    """
    sequences = []
    file_info = {}

    for fasta_file in fasta_files:
        print(fasta_file)
        # str() lets pathlib.Path inputs go through the same suffix check.
        if not str(fasta_file).endswith(('.fasta', '.fa')):
            continue

        path = Path(fasta_file)
        file_name = path.stem
        file_seqs = []

        try:
            for record in SeqIO.parse(fasta_file, "fasta"):
                # Default to Biopython's ID; override with the first "id="
                # field found in the header, if any.
                seq_id = record.id
                for part in record.description.split():
                    if part.startswith("id="):
                        # Header fields are comma-separated ("id=1, T=0.1"),
                        # so the whitespace-split token ends with a comma
                        # that is not part of the design number.
                        seq_id = part[3:].rstrip(",")
                        break

                sequences.append((seq_id, str(record.seq), file_name))
                file_seqs.append(seq_id)
        except Exception as e:
            # The input is usually a plain string, which has no `.name`;
            # derive the file name through Path so the handler itself cannot
            # fail, and chain the original error for debugging.
            raise ValueError(f"Error parsing {path.name}: {e}") from e

        file_info[file_name] = file_seqs

    if not sequences:
        raise ValueError("No sequences found in the provided FASTA files.")

    return sequences, file_info