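# NOTE(review): the lines below are the repository page header that was
# captured together with the file; kept as comments so the module parses.
# Prompt_Squirrel_RAG / psq_rag / parsing / prompt_grammar.py
# Food Desert
# Add alias-based character tag filtering for Stage 3
# c6be992
# Raw
# History Blame
# 2.09 kB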
import re
from lark import Lark, Token
# Parser
# Lark grammar for prompts made of comma-separated text with optional
# parenthesised emphasis, e.g. "a, (b), (c:1.2)".
#   start      - any mix of `prompt` runs and stray characters from the set
#                ] [ ( ) :  (presumably so unbalanced punctuation still
#                parses instead of raising - TODO confirm intent).
#   prompt     - a run of emphasized groups, plain text, commas and whitespace.
#   emphasized - "(" prompt ")" or "(" prompt ":" NUMBER ")" with optional
#                whitespace around the number.
#   plain      - one or more characters that are not , \ [ ] ( ) : |  or a
#                backslash followed by any character (an escape).
#   NUMBER     - Lark's common.SIGNED_NUMBER.
# The leading "!" on `start` and `emphasized` tells Lark to keep the literal
# punctuation tokens in the tree instead of filtering them out.
# NOTE(review): the regex inside `plain` has no terminal name, so Lark assigns
# it an auto-generated one; extract_tags() matches on '__ANON_1', which
# presumably is that name. Editing or reordering the anonymous regexes in this
# grammar may change the generated name - verify before touching.
grammar=r"""
!start: (prompt | /[][():]/+)*
prompt: (emphasized | plain | comma | WHITESPACE)*
!emphasized: "(" prompt ")"
| "(" prompt ":" [WHITESPACE] NUMBER [WHITESPACE] ")"
comma: ","
WHITESPACE: /\s+/
plain: /([^,\\\[\]():|]|\\.)+/
%import common.SIGNED_NUMBER -> NUMBER
"""
# Build the parser once at import time; `start` is the entry rule.
parser = Lark(grammar, start='start')
# Function to extract tags
def extract_tags(tree):
    """Collect every plain-text tag token from a parsed prompt tree.

    The tree is walked depth-first, left to right, and each ``Token`` whose
    type is ``'__ANON_1'`` is recorded.

    Args:
        tree: Root of the parse result. Inner nodes must expose a
            ``children`` sequence; leaves are ``Token`` objects. ``None``
            entries are tolerated and skipped.

    Returns:
        A list of ``(tag_text, start_pos, "tag")`` tuples in traversal
        order, where ``tag_text`` is the token's ``value`` and ``start_pos``
        is the token's ``start_pos``.
    """
    tags_with_positions = []
    # Explicit stack instead of recursion, so deeply nested parentheses
    # cannot run into Python's recursion limit.
    pending = [tree]
    while pending:
        node = pending.pop()
        if node is None:
            # With Lark's `maybe_placeholders` option enabled (the default
            # in Lark 1.x), an unmatched optional `[...]` item is stored as
            # None in `children`; it carries no tag, so skip it rather than
            # failing on `None.children`.
            continue
        if isinstance(node, Token):
            if node.type == '__ANON_1':
                tags_with_positions.append((node.value, node.start_pos, "tag"))
            continue
        # Push children reversed so the leftmost child is popped first and
        # tags come out in the same order as a recursive walk.
        pending.extend(reversed(node.children))
    return tags_with_positions
def build_tag_offsets_dicts(new_image_tags_with_positions):
    """Build one descriptive dict per tag, with offsets and normalised forms.

    Args:
        new_image_tags_with_positions: Iterable of
            ``(tag_text, start_pos, node_type)`` tuples.

    Returns:
        A list of dicts, one per input tuple and in the same order, with keys:

        * ``original_tag``: ``tag_text`` unchanged.
        * ``start_pos`` / ``end_pos``: ``start_pos`` and
          ``start_pos + len(tag_text)`` (length of the unmodified text).
        * ``modified_tag``: underscores turned into spaces, ``\\(`` and
          ``\\)`` unescaped to bare parentheses, surrounding whitespace
          stripped.
        * ``artist_matrix_tag``: underscores turned into spaces and
          surrounding whitespace stripped; backslash-escaped parentheses
          are kept as written.
        * ``tf_idf_matrix_tag``: stripped text with a leading ``"by "`` and
          then a leading ``"by_"`` removed, spaces turned into underscores,
          and escaped parentheses unescaped.
        * ``node_type``: the tuple's third element.
    """
    tag_data = []
    for tag_text, start_pos, nodetype in new_image_tags_with_positions:
        spaced = tag_text.replace('_', ' ')

        modified_tag = spaced.replace('\\(', '(').replace('\\)', ')').strip()

        # Escaped parentheses are deliberately left untouched here. The
        # previous code "replaced" '\\(' with '\(' (the same two characters,
        # spelled with an invalid escape sequence), which was a no-op that
        # only produced a SyntaxWarning on newer Python versions.
        artist_matrix_tag = spaced.strip()

        without_by = tag_text.strip().removeprefix('by ').removeprefix('by_')
        tf_idf_matrix_tag = re.sub(
            r'\\([()])', r'\1', without_by.replace(' ', '_')
        )

        tag_data.append({
            "original_tag": tag_text,
            "start_pos": start_pos,
            # End offset is measured on the original, unmodified tag text.
            "end_pos": start_pos + len(tag_text),
            "modified_tag": modified_tag,
            "artist_matrix_tag": artist_matrix_tag,
            "tf_idf_matrix_tag": tf_idf_matrix_tag,
            "node_type": nodetype,
        })
    return tag_data
if __name__ == "__main__":
    # Smoke check when run as a script: reaching this point means the
    # imports and the module-level parser construction did not raise.
    import_check_message = "prompt_grammar.py imports ok"
    print(import_check_message)