Spaces:

FoodDesert
/

Prompt_Squirrel_RAG

Running

Prompt_Squirrel_RAG / psq_rag /parsing /prompt_grammar.py

Food Desert

Add alias-based character tag filtering for Stage 3

c6be992 4 months ago

2.09 kB

	import re
	from lark import Lark, Token


	# Parser
	#
	# Lark grammar for prompts with optional parenthesised emphasis.
	#   start      - any sequence of `prompt` nodes and runs of the characters
	#                ] [ ( ) :
	#   prompt     - any sequence of emphasized groups, plain text, commas and
	#                whitespace.
	#   emphasized - "(" prompt ")" or "(" prompt ":" NUMBER ")", with optional
	#                whitespace around the number. The leading "!" tells Lark to
	#                keep the literal tokens in the tree.
	#   plain      - one or more characters that are not , \ [ ] ( ) : |, or a
	#                backslash followed by any single character.
	#
	# Alternatives are separated with a bare "|". An escaped "\|" is not valid
	# Lark rule syntax, and inside the `plain` regex it would match a literal
	# pipe instead of alternating with the backslash-escape branch.
	#
	# NOTE(review): extract_tags() matches the auto-generated terminal name
	# '__ANON_1', presumably the `plain` regex (the second anonymous pattern
	# below). Adding or reordering anonymous patterns may change that name;
	# verify before editing.
	grammar = r"""
	!start: (prompt | /[][():]/+)*
	prompt: (emphasized | plain | comma | WHITESPACE)*
	!emphasized: "(" prompt ")"
	        | "(" prompt ":" [WHITESPACE] NUMBER [WHITESPACE] ")"
	comma: ","
	WHITESPACE: /\s+/
	plain: /([^,\\\[\]():|]|\\.)+/
	%import common.SIGNED_NUMBER -> NUMBER
	"""

	# Initialize the parser once at import time, with `start` as the entry rule.
	parser = Lark(grammar, start='start')

	# Function to extract tags
	def extract_tags(tree):
	    """Collect the text and position of every `__ANON_1` token in a tree.

	    The tree is walked depth-first, children left to right, so the result
	    follows the order in which the tokens appear in the tree.

	    Args:
	        tree: Root node to walk. Every node that is not a `Token` (and not
	            None) must expose a `children` sequence.

	    Returns:
	        A list of `(tag_text, start_pos, "tag")` tuples, one per token whose
	        `type` is `'__ANON_1'`.
	    """
	    tags_with_positions = []
	    # An explicit stack keeps deeply nested input from hitting Python's
	    # recursion limit.
	    pending = [tree]
	    while pending:
	        node = pending.pop()
	        if node is None:
	            # With Lark's maybe_placeholders option enabled (the default
	            # since Lark 1.0), an unmatched optional item such as
	            # `[WHITESPACE]` shows up as a None child. It has no children.
	            continue
	        if isinstance(node, Token):
	            if node.type == '__ANON_1':
	                tags_with_positions.append((node.value, node.start_pos, "tag"))
	            continue
	        # Push in reverse so the leftmost child is popped (visited) first.
	        pending.extend(reversed(node.children))
	    return tags_with_positions



	def build_tag_offsets_dicts(new_image_tags_with_positions):
	    """Expand `(tag_text, start_pos, node_type)` tuples into per-tag dicts.

	    Args:
	        new_image_tags_with_positions: Iterable of 3-tuples
	            `(tag_text, start_pos, node_type)`.

	    Returns:
	        A list with one dict per input tuple, holding:
	          - "original_tag": the tag text exactly as given.
	          - "start_pos": the given start position.
	          - "end_pos": `start_pos + len(tag_text)`, measured on the
	            unmodified text.
	          - "modified_tag": underscores turned into spaces, `\\(` and `\\)`
	            unescaped to plain parentheses, surrounding whitespace stripped.
	          - "artist_matrix_tag": underscores turned into spaces and
	            surrounding whitespace stripped; backslash-escaped parentheses
	            are kept as they are.
	          - "tf_idf_matrix_tag": stripped, a leading "by " and then a leading
	            "by_" removed, spaces turned into underscores, escaped
	            parentheses unescaped.
	          - "node_type": the given node type.
	    """
	    # Matches a backslash followed by "(" or ")"; group 1 is the parenthesis.
	    escaped_paren = re.compile(r'\\([()])')

	    # Structure the data for HighlightedText
	    tag_data = []
	    for tag_text, start_pos, nodetype in new_image_tags_with_positions:
	        spaced = tag_text.replace('_', ' ')
	        modified_tag = spaced.replace('\\(', '(').replace('\\)', ')').strip()
	        # The escaped parentheses stay untouched here. The earlier code
	        # replaced them with themselves using the invalid escape sequences
	        # '\(' and '\)', which only produced a SyntaxWarning on newer Pythons.
	        artist_matrix_tag = spaced.strip()
	        without_by = tag_text.strip().removeprefix('by ').removeprefix('by_')
	        tf_idf_matrix_tag = escaped_paren.sub(r'\1', without_by.replace(' ', '_'))
	        tag_data.append({
	            "original_tag": tag_text,
	            "start_pos": start_pos,
	            # End offset is based on the original, unmodified tag length.
	            "end_pos": start_pos + len(tag_text),
	            "modified_tag": modified_tag,
	            "artist_matrix_tag": artist_matrix_tag,
	            "tf_idf_matrix_tag": tf_idf_matrix_tag,
	            "node_type": nodetype,
	        })
	    return tag_data


	def _report_import_ok():
	    """Print a one-line confirmation that this module was imported."""
	    print("prompt_grammar.py imports ok")


	# Running the file directly acts as a smoke test: reaching this point means
	# the module-level code above executed without raising.
	if __name__ == "__main__":
	    _report_import_ok()