import re
from lark import Lark, Token


# Lark grammar describing prompt text.
#
# Rules, as written in the grammar string below:
#   start      - any sequence of `prompt` items and runs of the characters
#                `]`, `[`, `(`, `)` and `:` that are not part of a `prompt`.
#   prompt     - any sequence of emphasized groups, plain text, commas and
#                whitespace.
#   emphasized - a prompt wrapped in parentheses, optionally followed (inside
#                the parentheses) by `:` and a signed number, with optional
#                whitespace on either side of the number.
#   plain      - one or more characters that are either not one of
#                `, \ [ ] ( ) : |` or are a backslash followed by any character.
#
# Per Lark's grammar reference, the leading `!` on `start` and `emphasized`
# keeps every matched token (including the literal punctuation) in the tree.
#
# NOTE(review): extract_tags() matches tokens by the terminal name '__ANON_1',
# which is presumably the name Lark auto-generates for one of the anonymous
# regex terminals below. Adding, removing or reordering anonymous terminals
# may change that name - confirm before editing the grammar.
grammar=r"""
!start: (prompt | /[][():]/+)*
prompt: (emphasized | plain | comma | WHITESPACE)*
!emphasized: "(" prompt ")"
        | "(" prompt ":" [WHITESPACE] NUMBER [WHITESPACE] ")"
comma: ","
WHITESPACE: /\s+/
plain: /([^,\\\[\]():|]|\\.)+/
%import common.SIGNED_NUMBER -> NUMBER
"""

# Build the parser once, at import time, so it is shared by all callers.
# Only the start rule is specified; every other Lark option is left at its
# default.
parser = Lark(grammar, start='start')

# Function to extract tags
def extract_tags(tree):
    """Collect the tag tokens found in a parsed prompt tree.

    The tree is walked depth-first, left to right, using an explicit stack
    rather than recursion, so deeply nested input cannot hit Python's
    recursion limit here.

    Args:
        tree: Root of the parse result. Any node exposing a ``children``
            sequence is descended into, ``Token`` leaves are inspected, and
            ``None`` entries are skipped.

    Returns:
        A list of ``(tag_text, start_pos, "tag")`` tuples in walk order,
        where ``tag_text`` is the token's ``value`` and ``start_pos`` is the
        token's ``start_pos``.
    """
    # NOTE(review): '__ANON_1' is presumably the name Lark auto-generates for
    # the anonymous regex terminal behind the grammar's `plain` rule. It
    # depends on how the grammar's anonymous terminals are declared - confirm
    # whenever the grammar changes.
    tag_token_type = '__ANON_1'

    tags_with_positions = []
    pending = [tree]
    while pending:
        node = pending.pop()
        if node is None:
            # An optional `[item]` that did not match is represented by None
            # when Lark's maybe_placeholders option is on; there is nothing
            # to inspect or descend into.
            continue
        children = getattr(node, 'children', None)
        if children is not None:
            # Push in reverse so the left-most child is popped first.
            pending.extend(reversed(children))
        elif isinstance(node, Token) and node.type == tag_token_type:
            tags_with_positions.append((node.value, node.start_pos, "tag"))
    return tags_with_positions
    


def build_tag_offsets_dicts(new_image_tags_with_positions):
    """Expand ``(tag_text, start_pos, node_type)`` tuples into per-tag dicts.

    Args:
        new_image_tags_with_positions: Iterable of
            ``(tag_text, start_pos, node_type)`` tuples.

    Returns:
        A list with one dict per input tuple, in input order, with keys:

        * ``original_tag``: ``tag_text`` unchanged.
        * ``start_pos``: ``start_pos`` unchanged.
        * ``end_pos``: ``start_pos + len(tag_text)``.
        * ``modified_tag``: underscores replaced by spaces, ``\\(`` and
          ``\\)`` unescaped to bare parentheses, then stripped.
        * ``artist_matrix_tag``: underscores replaced by spaces, then
          stripped; backslash-escaped parentheses are kept as they are.
        * ``tf_idf_matrix_tag``: stripped, a leading ``by `` and then a
          leading ``by_`` removed, spaces replaced by underscores, and
          ``\\(`` / ``\\)`` unescaped to bare parentheses.
        * ``node_type``: ``node_type`` unchanged.
    """
    tag_data = []
    for tag_text, start_pos, nodetype in new_image_tags_with_positions:
        spaced = tag_text.replace('_', ' ')

        modified_tag = spaced.replace('\\(', '(').replace('\\)', ')').strip()

        # Escaped parentheses are deliberately left untouched here. The
        # previous code "replaced" `\(` with `\(` (and likewise `\)`) using
        # the invalid escape sequences '\(' / '\)', which did nothing except
        # trigger a SyntaxWarning on recent Python versions.
        artist_matrix_tag = spaced.strip()

        without_prefix = tag_text.strip().removeprefix('by ').removeprefix('by_')
        tf_idf_matrix_tag = re.sub(
            r'\\([()])', r'\1', without_prefix.replace(' ', '_')
        )

        tag_data.append({
            "original_tag": tag_text,
            "start_pos": start_pos,
            # The end offset is based on the original, unmodified tag text.
            "end_pos": start_pos + len(tag_text),
            "modified_tag": modified_tag,
            "artist_matrix_tag": artist_matrix_tag,
            "tf_idf_matrix_tag": tf_idf_matrix_tag,
            "node_type": nodetype,
        })
    return tag_data
    
    
if __name__ == "__main__":
    # Running this file directly is only a smoke check: reaching this point
    # means the module-level code above executed without raising.
    status_line = "prompt_grammar.py imports ok"
    print(status_line)