Spaces:
Running
Running
| import gradio as gr | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import numpy as np | |
| from joblib import load | |
| import h5py | |
| from io import BytesIO | |
| import csv | |
| import re | |
| import random | |
| import compress_fasttext | |
| from collections import OrderedDict | |
| from lark import Lark | |
| from lark import Token | |
# Markdown FAQ text; handed to gr.Interface as the `article` argument.
faq_content="""
# Frequently Asked Questions (FAQs)
Technically I am writing this before anyone but me has used the tool, so no one has asked questions yet. But if they did, here are the questions I think they might ask:
## Does input order matter?
No
## Should I use underscores in the input tags?
It doesn't matter. The application handles tags either way.
## Why are some valid tags marked as "unseen", and why don't some artists ever get returned?
Some data is excluded from consideration if it did not occur frequently enough in the sample from which the application makes its calculations.
If an artist or tag is too infrequent, we might not think we have enough data to make predictions about it.
## Are there any special tags?
Yes. We normalized the favorite counts of each image to a range of 0-9, with 0 being the lowest favcount, and 9 being the highest.
You can include any of these special tags: "score:0", "score:1", "score:2", "score:3", "score:4", "score:5", "score:6", "score:7", "score:8", "score:9"
in your list to bias the output toward artists with higher or lower scoring images.
## Are there any other special tricks?
Yes. If you want to more strongly bias the artist output toward a specific tag, you can just list it multiple times.
So for example, the query "red fox, red fox, red fox, score:7" will yield a list of artists who are more strongly associated with the tag "red fox"
than the query "red fox, score:7".
## What calculation is this thing actually performing?
Each artist is represented by a "pseudo-document" composed of all the tags from their uploaded images, treating these tags similarly to words in a text document.
Similarly, when you input a set of tags, the system creates a pseudo-document for your query out of all the tags.
It then uses a technique called cosine similarity to compare your tags against each artist's collection, essentially finding which artist's tags are most "similar" to yours.
This method helps identify artists whose work is closely aligned with the themes or elements you're interested in.
For those curious about the underlying mechanics of comparing text-like data, we employ the TF-IDF (Term Frequency-Inverse Document Frequency) method, a standard approach in information retrieval.
You can read more about TF-IDF on its [Wikipedia page](https://en.wikipedia.org/wiki/Tf%E2%80%93idf).
"""
# Lark grammar used to split a prompt string into its pieces.
#   start      - any mix of prompts and stray bracket / paren / colon characters
#   prompt     - a sequence of emphasized groups, plain text, commas and whitespace
#   emphasized - "(prompt)" or "(prompt:NUMBER)", whitespace allowed around NUMBER
#   plain      - a run of characters other than , \ [ ] ( ) : | (or any
#                backslash-escaped character); this is where tag text ends up
grammar=r"""
!start: (prompt | /[][():]/+)*
prompt: (emphasized | plain | comma | WHITESPACE)*
!emphasized: "(" prompt ")"
| "(" prompt ":" [WHITESPACE] NUMBER [WHITESPACE] ")"
comma: ","
WHITESPACE: /\s+/
plain: /([^,\\\[\]():|]|\\.)+/
%import common.SIGNED_NUMBER -> NUMBER
"""
# Initialize the parser
# Built once at import; find_similar_artists calls parser.parse on each request.
parser = Lark(grammar, start='start')
def extract_tags(tree):
    """Collect the text of every ``plain`` node in a parsed prompt tree.

    Tag text is located through the grammar's named ``plain`` rule rather
    than through the auto-generated name of its anonymous terminal, so the
    result does not depend on how many unnamed regexes the grammar contains
    or in which order they are declared.

    Args:
        tree: Root of the parse tree. Any object with ``data`` and
            ``children`` attributes is treated as an inner node; anything
            without ``children`` is treated as a leaf token.

    Returns:
        A list of tag strings in document order, each stripped of
        surrounding whitespace.
    """
    tags = []

    def _traverse(node):
        children = getattr(node, "children", None)
        if children is None:
            # Leaf token (punctuation, whitespace, number): nothing to descend into.
            return
        if getattr(node, "data", None) == "plain":
            # Tokens are string-like, so str() yields the matched text.
            tags.extend(str(child).strip() for child in children)
            return
        for child in children:
            _traverse(child)

    _traverse(tree)
    return tags
def _load_artist_data(path='complete_artist_data.hdf5'):
    """Read the vectorizer, artist matrix and artist names from an HDF5 file.

    Keeping the work inside a function lets the raw vectorizer bytes and the
    in-memory buffer be released as soon as loading finishes, instead of
    living on as module-level globals.

    Args:
        path: Location of the HDF5 file holding the ``vectorizer``,
            ``X_artist`` and ``artist_names`` datasets.

    Returns:
        A ``(vectorizer, X_artist, artist_names)`` tuple, where
        ``artist_names`` is a list of decoded strings.
    """
    with h5py.File(path, 'r') as f:
        # The vectorizer is stored as a byte blob and read back from memory.
        # NOTE(review): joblib.load unpickles its input, so this file must
        # come from a trusted source.
        vectorizer = load(BytesIO(f['vectorizer'][()].tobytes()))
        X_artist = f['X_artist'][:]
        artist_names = [name.decode() for name in f['artist_names'][:]]
    return vectorizer, X_artist, artist_names


# Load the model and data once at startup
vectorizer, X_artist, artist_names = _load_artist_data()
def clean_tag(tag):
    """Return ``tag`` with every non-ASCII character removed."""
    # Encoding with errors='ignore' silently drops any code point >= 128.
    ascii_only = tag.encode('ascii', 'ignore')
    return ascii_only.decode('ascii')
# Normally maps each tag to its aliases; with reverse=True, maps each alias to its tags.
def build_aliases_dict(filename, reverse=False):
    """Build a tag/alias lookup table from a CSV file.

    Column 0 of each row is the tag and column 3 holds its aliases as a
    comma-separated string, or the literal ``"null"`` when there are none.
    Tags and aliases are passed through ``clean_tag`` before being stored.

    Args:
        filename: Path of the UTF-8 encoded CSV file.
        reverse: When false (default) the result maps ``tag -> [aliases]``.
            When true it maps ``alias -> [tags]``.

    Returns:
        The lookup dictionary described above.

    Raises:
        IndexError: If a non-blank row has fewer than four columns.
    """
    aliases_dict = {}
    with open(filename, 'r', newline='', encoding='utf-8') as csvfile:
        for row in csv.reader(csvfile):
            if not row:
                # csv.reader yields an empty list for a blank line; indexing
                # it would raise IndexError, so skip it.
                continue
            tag = clean_tag(row[0])
            if row[3] == "null":
                alias_list = []
            else:
                alias_list = [clean_tag(alias) for alias in row[3].split(',')]
            if reverse:
                for alias in alias_list:
                    aliases_dict.setdefault(alias, []).append(tag)
            else:
                aliases_dict[tag] = alias_list
    return aliases_dict
def find_similar_tags(test_tags):
    """Suggest known tags for every input tag that is not itself a known tag.

    The FastText model and both alias tables are loaded on the first call
    and cached as attributes of this function.

    Args:
        test_tags: Iterable of tag strings (spaces or underscores accepted).

    Returns:
        A list of ``[tag, similar_tag, similarity]`` rows for a dataframe.
        Only the first row of a group carries the input tag; each group is
        followed by a blank ``["", "", ""]`` row. If no row was produced the
        result is ``[["No Unknown Tags Found", "", ""]]``.
    """
    # Lazily load the heavy resources once and keep them on the function object.
    if not hasattr(find_similar_tags, "fasttext_small_model"):
        find_similar_tags.fasttext_small_model = compress_fasttext.models.CompressedFastTextKeyedVectors.load('e621FastTextModel010Replacement_small.bin')
    tag_aliases_file = 'fluffyrock_3m.csv'
    if not hasattr(find_similar_tags, "tag2aliases"):
        find_similar_tags.tag2aliases = build_aliases_dict(tag_aliases_file)
    if not hasattr(find_similar_tags, "alias2tags"):
        find_similar_tags.alias2tags = build_aliases_dict(tag_aliases_file, reverse=True)

    model = find_similar_tags.fasttext_small_model
    tag2aliases = find_similar_tags.tag2aliases
    alias2tags = find_similar_tags.alias2tags

    # Find similar tags and prepare data for dataframe.
    results_data = []
    for tag in test_tags:
        if not tag:
            # An empty string carries no information to look up.
            continue
        modified_tag_for_search = tag.replace(' ', '_')
        result, seen = [], set()
        if modified_tag_for_search in tag2aliases:
            if tag in tag2aliases and "_" in tag:
                # Implicitly tell the user that they should get rid of the underscore.
                # append() takes a single argument, so the pair must be a tuple.
                result.append((modified_tag_for_search.replace('_', ' '), 1))
                seen.add(tag)
            else:
                # The user correctly did not put underscores in their tag.
                continue
        else:
            # Only query the model for tags that are actually unknown.
            for similar_word, similarity in model.most_similar(modified_tag_for_search):
                if similar_word in seen:
                    continue
                if similar_word in tag2aliases:
                    result.append((similar_word.replace('_', ' '), round(similarity, 3)))
                    seen.add(similar_word)
                else:
                    # Not a tag itself: fall back to the tags it is an alias of.
                    for similar_tag in alias2tags.get(similar_word, []):
                        if similar_tag not in seen:
                            result.append((similar_tag.replace('_', ' '), round(similarity, 3)))
                            seen.add(similar_tag)

        # Append tag and formatted similar tags to results_data;
        # the input tag is shown only on the first row of its group.
        for position, (word, sim) in enumerate(result):
            results_data.append([tag if position == 0 else "", word, sim])
        results_data.append(["", "", ""])  # Adds a blank line after each group of tags

    if not results_data:
        results_data.append(["No Unknown Tags Found", "", ""])
    return results_data  # Return list of lists for Dataframe
def find_similar_artists(new_tags_string, top_n):
    """Rank artists by cosine similarity between their tags and a prompt.

    Args:
        new_tags_string: Raw prompt text; it is parsed with the module-level
            ``parser`` and its tags are pulled out with ``extract_tags``.
        top_n: Number of artists to return. Coerced to ``int`` because a UI
            slider may deliver the value as a float.

    Returns:
        A 3-tuple of:
        * the rows produced by ``find_similar_tags`` for the prompt's tags,
        * a newline-separated, numbered list of artists with their scores,
        * the same artists formatted as ``{a|b|c}``.
    """
    # Slice bounds must be integers.
    top_n = int(top_n)

    # Parse the prompt and extract its tags, normalising underscores to spaces.
    parsed = parser.parse(new_tags_string)
    new_image_tags = [tag.replace('_', ' ').strip() for tag in extract_tags(parsed)]

    unseen_tags_data = find_similar_tags(new_image_tags)

    X_new_image = vectorizer.transform([','.join(new_image_tags)])
    similarities = cosine_similarity(X_new_image, X_artist)[0]

    # Reverse first, then take a prefix: same order as taking the last
    # top_n and reversing, but top_n == 0 yields nothing instead of everything.
    top_artist_indices = np.argsort(similarities)[::-1][:top_n]
    top_artists = [(artist_names[i], similarities[i]) for i in top_artist_indices]

    # artist[3:] drops the first three characters of the stored name
    # (presumably a "by " prefix - TODO confirm against the data file).
    top_artists_str = "\n".join(
        f"{rank}. {artist[3:]} ({score:.4f})"
        for rank, (artist, score) in enumerate(top_artists, start=1)
    )
    dynamic_prompts_formatted_artists = "{" + "|".join(artist for artist, _ in top_artists) + "}"
    return unseen_tags_data, top_artists_str, dynamic_prompts_formatted_artists
# Assemble the web UI: a tag textbox and an artist-count slider go in; a table
# of tag suggestions and two text fields come out, all driven by
# find_similar_artists. The FAQ markdown is attached as the article.
iface = gr.Interface(
    title="Tagset Completer",
    description="Enter a list of comma-separated e6 tags",
    article=faq_content,
    fn=find_similar_artists,
    inputs=[
        gr.Textbox(
            label="Enter image tags",
            placeholder="e.g. fox, outside, detailed background, ...",
        ),
        gr.Slider(
            label="Number of artists",
            minimum=1,
            maximum=100,
            value=10,
            step=1,
        ),
    ],
    outputs=[
        gr.Dataframe(
            label="Unseen Tags",
            headers=["Tag", "Similar Tags", "Similarity"],
        ),
        gr.Textbox(
            label="Top Artists",
            info="These are the artists most strongly associated with your tags. The number in parenthes is a similarity score between 0 and 1, with higher numbers indicating greater similarity.",
        ),
        gr.Textbox(
            label="Dynamic Prompts Format",
            info="For if you're using the Automatic1111 webui (https://github.com/AUTOMATIC1111/stable-diffusion-webui) with the Dynamic Prompts extension activated (https://github.com/adieyal/sd-dynamic-prompts) and want to try them all individually.",
        ),
    ],
)

# Start serving the interface.
iface.launch()