"""Lark grammar for prompt strings plus helpers that locate plain-text tags."""

import re

from lark import Lark, Token

# Parser grammar
grammar=r"""
!start: (prompt | /[][():]/+)*
prompt: (emphasized | plain | comma | WHITESPACE)*
!emphasized: "(" prompt ")" | "(" prompt ":" [WHITESPACE] NUMBER [WHITESPACE] ")"
comma: ","
WHITESPACE: /\s+/
plain: /([^,\\\[\]():|]|\\.)+/
%import common.SIGNED_NUMBER -> NUMBER
"""

# Initialize the parser
parser = Lark(grammar, start='start')

# Token type that extract_tags() collects.
# NOTE(review): presumably the auto-generated name Lark gives the anonymous
# regex inside the `plain` rule; it depends on the order of anonymous
# terminals in the grammar -- confirm whenever the grammar is edited.
_PLAIN_TOKEN_TYPE = '__ANON_1'

# A backslash followed by "(" or ")"; group 1 is the bare parenthesis.
_ESCAPED_PAREN_RE = re.compile(r'\\([()])')


# Function to extract tags
def extract_tags(tree) -> list[tuple[str, int, str]]:
    """Collect every tag token found in a parse tree.

    The tree is walked depth-first, left to right, so tokens are reported in
    the order they occur among the tree's children.

    Args:
        tree: The root node to walk, e.g. the result of ``parser.parse(text)``.

    Returns:
        A list of ``(tag_text, start_pos, "tag")`` tuples, one per token whose
        type equals ``_PLAIN_TOKEN_TYPE``.
    """
    tags_with_positions = []

    # An explicit stack replaces recursion so that deeply nested parentheses
    # cannot exhaust Python's recursion limit.
    pending = [tree]
    while pending:
        node = pending.pop()

        if isinstance(node, Token):
            if node.type == _PLAIN_TOKEN_TYPE:
                tags_with_positions.append((node.value, node.start_pos, "tag"))
            continue

        # Child lists can hold ``None`` placeholders for an unmatched
        # ``[WHITESPACE]`` when Lark's ``maybe_placeholders`` option is on;
        # anything without ``children`` is skipped instead of crashing.
        children = getattr(node, "children", None)
        if children:
            # Push in reverse so the leftmost child is popped first.
            pending.extend(reversed(children))

    return tags_with_positions


def build_tag_offsets_dicts(new_image_tags_with_positions) -> list[dict]:
    """Build one descriptive dict per extracted tag.

    Args:
        new_image_tags_with_positions: Iterable of
            ``(tag_text, start_pos, node_type)`` tuples, in the shape produced
            by ``extract_tags``.

    Returns:
        A list of dicts with these keys:

        * ``original_tag``: the tag text exactly as given.
        * ``start_pos``: the given start offset.
        * ``end_pos``: ``start_pos`` plus the length of the original text.
        * ``modified_tag``: underscores turned into spaces, ``\\(`` and
          ``\\)`` unescaped, surrounding whitespace stripped.
        * ``artist_matrix_tag``: underscores turned into spaces and stripped;
          backslash-escaped parentheses are left escaped.
        * ``tf_idf_matrix_tag``: stripped, a leading ``by `` and then a
          leading ``by_`` removed, spaces turned into underscores, escaped
          parentheses unescaped.
        * ``node_type``: the given node type.
    """
    # Structure the data for HighlightedText
    tag_data = []
    for tag_text, start_pos, nodetype in new_image_tags_with_positions:
        spaced = tag_text.replace('_', ' ')

        modified_tag = spaced.replace('\\(', '(').replace('\\)', ')').strip()

        # Escaped parentheses are intentionally kept as-is here. The earlier
        # code replaced '\\(' with '\(' (the same two characters), which was a
        # no-op that also triggered an invalid-escape-sequence warning.
        artist_matrix_tag = spaced.strip()

        without_prefix = tag_text.strip().removeprefix('by ').removeprefix('by_')
        tf_idf_matrix_tag = _ESCAPED_PAREN_RE.sub(
            r'\1', without_prefix.replace(' ', '_')
        )

        # The end position is based on the original (unmodified) tag length.
        end_pos = start_pos + len(tag_text)

        tag_data.append({
            "original_tag": tag_text,
            "start_pos": start_pos,
            "end_pos": end_pos,
            "modified_tag": modified_tag,
            "artist_matrix_tag": artist_matrix_tag,
            "tf_idf_matrix_tag": tf_idf_matrix_tag,
            "node_type": nodetype,
        })

    return tag_data


if __name__ == "__main__":
    print("prompt_grammar.py imports ok")