File size: 2,089 Bytes
c6be992
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import re
from lark import Lark, Token


# Parser
#
# Lark grammar for prompt strings. Reading the rules below:
#   * start      - any mix of `prompt` sections and runs of stray bracket /
#                  parenthesis / colon characters (`[`, `]`, `(`, `)`, `:`).
#   * prompt     - a sequence of emphasized groups, plain text, commas and
#                  whitespace.
#   * emphasized - a parenthesised prompt, optionally followed by
#                  ":" NUMBER before the closing parenthesis (presumably an
#                  emphasis weight - confirm against the consumer).
#   * plain      - one or more characters that are not `,`, `\`, `[`, `]`,
#                  `(`, `)`, `:` or `|`, or any backslash-escaped character.
#   * NUMBER     - Lark's common SIGNED_NUMBER terminal.
# The leading "!" on `start` and `emphasized` tells Lark to keep all tokens of
# those rules (including punctuation) in the resulting tree.
grammar=r"""

!start: (prompt | /[][():]/+)*

prompt: (emphasized | plain | comma | WHITESPACE)*

!emphasized: "(" prompt ")"

        | "(" prompt ":" [WHITESPACE] NUMBER [WHITESPACE] ")"

comma: ","

WHITESPACE: /\s+/

plain: /([^,\\\[\]():|]|\\.)+/

%import common.SIGNED_NUMBER -> NUMBER

"""

# Initialize the parser once at import time; `start` is the entry rule.
parser = Lark(grammar, start='start')

# Function to extract tags
def extract_tags(tree):
    """Collect the plain-text tag tokens found in a parsed prompt tree.

    The tree is walked depth-first, left to right. Every token whose type is
    ``'__ANON_1'`` is reported.

    NOTE(review): ``'__ANON_1'`` is a name Lark generates automatically for an
    anonymous terminal (presumably the regex of the ``plain`` rule); it may
    change if anonymous terminals are added to or reordered in the grammar.

    Args:
        tree: Root node of the parse result. Nodes exposing a ``children``
            attribute are descended into; everything else is treated as a
            leaf.

    Returns:
        A list of ``(tag_text, start_pos, "tag")`` tuples in document order,
        where ``start_pos`` is the token's ``start_pos`` attribute.
    """
    tags_with_positions = []

    def _traverse(node):
        # Unmatched optional items (e.g. ``[WHITESPACE]``) can appear as
        # ``None`` placeholders among a node's children; they carry no tag
        # and have no ``children`` attribute, so skip them instead of failing.
        if node is None:
            return
        children = getattr(node, "children", None)
        if children is not None:
            # Interior node: recurse into each child in order.
            for child in children:
                _traverse(child)
        elif isinstance(node, Token) and node.type == '__ANON_1':
            tags_with_positions.append((node.value, node.start_pos, "tag"))

    _traverse(tree)
    return tags_with_positions
    


def build_tag_offsets_dicts(new_image_tags_with_positions):
    """Build one descriptive dict per tag from ``(text, start, type)`` triples.

    Args:
        new_image_tags_with_positions: Iterable of
            ``(tag_text, start_pos, node_type)`` tuples.

    Returns:
        A list of dicts, one per input tuple and in the same order, with keys:

        * ``original_tag``: the tag text exactly as given.
        * ``start_pos`` / ``end_pos``: the given start offset, and that offset
          plus the length of the original tag text.
        * ``modified_tag``: underscores turned into spaces, backslash-escaped
          parentheses unescaped, surrounding whitespace stripped.
        * ``artist_matrix_tag``: underscores turned into spaces and
          surrounding whitespace stripped; backslash-escaped parentheses are
          kept as they are.
        * ``tf_idf_matrix_tag``: surrounding whitespace stripped, a leading
          ``"by "`` then a leading ``"by_"`` removed, spaces turned into
          underscores, backslash-escaped parentheses unescaped.
        * ``node_type``: the node type exactly as given.
    """
    # Structure the data for HighlightedText
    tag_data = []
    for tag_text, start_pos, nodetype in new_image_tags_with_positions:
        spaced_tag = tag_text.replace('_', ' ')
        modified_tag = spaced_tag.replace('\\(', '(').replace('\\)', ')').strip()
        # Escaped parentheses are deliberately left untouched for this form.
        artist_matrix_tag = spaced_tag.strip()
        without_by = tag_text.strip().removeprefix('by ').removeprefix('by_')
        tf_idf_matrix_tag = re.sub(r'\\([()])', r'\1', without_by.replace(' ', '_'))
        # The end position is based on the original (unmodified) tag length
        # so the offsets keep pointing into the source prompt string.
        end_pos = start_pos + len(tag_text)
        tag_data.append({
            "original_tag": tag_text,
            "start_pos": start_pos,
            "end_pos": end_pos,
            "modified_tag": modified_tag,
            "artist_matrix_tag": artist_matrix_tag,
            "tf_idf_matrix_tag": tf_idf_matrix_tag,
            "node_type": nodetype,
        })
    return tag_data
    
    
# Smoke check when run as a script: reaching this line means the imports and
# the module-level parser construction above completed without raising.
if __name__ == "__main__":
    print("prompt_grammar.py imports ok")