# Revision c6be992 | File size: 2,089 bytes
import re
from lark import Lark, Token
# Parser: Lark grammar describing a prompt string.
#   start      - any mix of prompts and runs of bracket / parenthesis / colon
#                characters; the leading "!" keeps every matched token in the tree.
#   prompt     - a sequence of emphasized groups, plain text, commas and whitespace.
#   emphasized - a parenthesised prompt, optionally followed by ":" and a signed
#                number (with optional whitespace around the number).
#   plain      - a run of characters other than , \ [ ] ( ) : | in which a
#                backslash followed by any one character is also accepted.
# NOTE(review): the regex terminals are anonymous, so Lark names them itself;
# extract_tags matches on such a generated name - confirm before reordering rules.
grammar=r"""
!start: (prompt | /[][():]/+)*
prompt: (emphasized | plain | comma | WHITESPACE)*
!emphasized: "(" prompt ")"
| "(" prompt ":" [WHITESPACE] NUMBER [WHITESPACE] ")"
comma: ","
WHITESPACE: /\s+/
plain: /([^,\\\[\]():|]|\\.)+/
%import common.SIGNED_NUMBER -> NUMBER
"""
# Initialize the parser once at import time, using "start" as the entry rule.
parser = Lark(grammar, start='start')
# Function to extract tags
def extract_tags(tree):
    """Collect the plain-text tag tokens found in a parsed prompt tree.

    The tree is walked depth-first, left to right, so the result follows the
    order in which the tokens occur in the tree.

    Args:
        tree: Root node to walk. Every node that is neither ``None`` nor a
            ``Token`` must expose a ``children`` sequence (presumably a
            ``lark.Tree`` produced by the module-level parser - confirm
            against the caller).

    Returns:
        A list of ``(tag_text, start_pos, "tag")`` tuples, one per matching
        token, where ``tag_text`` is the token's ``value`` and ``start_pos``
        is the token's ``start_pos`` attribute.
    """
    tags_with_positions = []
    # An explicit stack replaces recursion so that deeply nested prompts
    # cannot exhaust the interpreter's recursion limit.
    pending = [tree]
    while pending:
        node = pending.pop()
        if node is None:
            # Lark stores None for an unmatched ``[...]`` optional when
            # ``maybe_placeholders`` is enabled; there is nothing to visit.
            continue
        if isinstance(node, Token):
            # NOTE(review): '__ANON_1' is presumably the name Lark generates
            # for the anonymous regex of the ``plain`` rule; it may change if
            # the grammar's anonymous terminals are reordered - verify.
            if node.type == '__ANON_1':
                tags_with_positions.append((node.value, node.start_pos, "tag"))
            continue
        # Push children reversed so they are popped in their original order.
        pending.extend(reversed(node.children))
    return tags_with_positions
def build_tag_offsets_dicts(new_image_tags_with_positions):
    """Build one descriptor dict per tag (structured for HighlightedText).

    Args:
        new_image_tags_with_positions: Iterable of
            ``(tag_text, start_pos, node_type)`` tuples.

    Returns:
        A list with one dict per input tuple, in input order, with the keys:

        * ``original_tag``: ``tag_text`` unchanged.
        * ``start_pos``: the given start position.
        * ``end_pos``: ``start_pos + len(tag_text)``, based on the original
          (unmodified) tag length.
        * ``modified_tag``: underscores turned into spaces, ``\\(`` and
          ``\\)`` unescaped, surrounding whitespace stripped.
        * ``artist_matrix_tag``: underscores turned into spaces and
          surrounding whitespace stripped; backslash-escaped parentheses
          are kept as they are.
        * ``tf_idf_matrix_tag``: stripped, leading ``"by "`` / ``"by_"``
          removed, spaces turned into underscores, ``\\(`` and ``\\)``
          unescaped.
        * ``node_type``: the given node type.
    """
    tag_data = []
    for tag_text, start_pos, nodetype in new_image_tags_with_positions:
        # Shared first step for the two space-separated variants.
        spaced = tag_text.replace('_', ' ')
        modified_tag = spaced.replace('\\(', '(').replace('\\)', ')').strip()
        # The previous code also "replaced" each escaped parenthesis with
        # itself here, spelled with invalid escape sequences ('\(' / '\)');
        # those calls were no-ops, so the escaped form is simply kept.
        artist_matrix_tag = spaced.strip()
        without_prefix = tag_text.strip().removeprefix('by ').removeprefix('by_')
        tf_idf_matrix_tag = re.sub(
            r'\\([()])', r'\1', without_prefix.replace(' ', '_')
        )
        tag_data.append({
            "original_tag": tag_text,
            "start_pos": start_pos,
            # End offset is measured on the original tag text so it lines up
            # with positions in the source prompt.
            "end_pos": start_pos + len(tag_text),
            "modified_tag": modified_tag,
            "artist_matrix_tag": artist_matrix_tag,
            "tf_idf_matrix_tag": tf_idf_matrix_tag,
            "node_type": nodetype,
        })
    return tag_data
# Smoke check: when run directly as a script, report that the module loaded.
if __name__ == "__main__":
    print("prompt_grammar.py imports ok")
|