""" Visual Product Recommender: Gradio app for Hugging Face Spaces. Dataset: Shopify/product-catalogue. Model: openai/clip-vit-base-patch32 (CLIP). Tab 1 (Recommender): upload a product photo or type a description, get the 3 most visually similar products from a ~12K-item catalogue, ranked by cosine similarity over CLIP embeddings. Tab 2 (Dataset & Analysis): the EDA, embeddings, clustering plots and reasoning (the full writeup). Tab 3 (Presentation): the recorded walk-through video. """ import base64, io, os import numpy as np import pandas as pd from PIL import Image import torch import gradio as gr from transformers import CLIPModel, CLIPProcessor MODEL = "openai/clip-vit-base-patch32" DEVICE = "cuda" if torch.cuda.is_available() else "cpu" TEXT_TEMPLATE = "a product photo of {}" # CLIP retrieves better with a short prompt template DUP = 0.985 # drop near-duplicate results print("loading catalog...") catalog = pd.read_parquet("catalog.parquet") EMB = np.array(catalog["embedding"].tolist(), dtype="float32") # L2-normalized 512-d vectors print("loading CLIP...") model = CLIPModel.from_pretrained(MODEL).to(DEVICE).eval() proc = CLIPProcessor.from_pretrained(MODEL) @torch.no_grad() def encode_text(q): inp = proc(text=[TEXT_TEMPLATE.format(q)], return_tensors="pt", padding=True, truncation=True).to(DEVICE) t = model.text_model(input_ids=inp["input_ids"], attention_mask=inp["attention_mask"]) f = model.text_projection(t.pooler_output) return (f / f.norm(dim=-1, keepdim=True)).cpu().numpy()[0] @torch.no_grad() def encode_image(img): inp = proc(images=img.convert("RGB"), return_tensors="pt").to(DEVICE) v = model.vision_model(pixel_values=inp["pixel_values"]) f = model.visual_projection(v.pooler_output) return (f / f.norm(dim=-1, keepdim=True)).cpu().numpy()[0] def top_matches(qvec, k=3): sims = EMB @ qvec # cosine (vectors are L2-normalized) order = np.argsort(-sims) chosen = [] for idx in order: if any(float(EMB[idx] @ EMB[j]) > DUP for j in chosen): continue chosen.append(int(idx)) if len(chosen) == k: break return [(i, float(sims[i])) for i in chosen] def b64_to_img(b): return Image.open(io.BytesIO(base64.b64decode(b))).convert("RGB") def render(results): out = [] for rank, (i, s) in enumerate(results, 1): row = catalog.iloc[i] cap = f"#{rank} Β· similarity {s:.2f} Β· {row['top_category']}\n{row['title'][:70]}" out.append((b64_to_img(row["thumb"]), cap)) return out def recommend(text, image): if image is not None: qvec = encode_image(image) mode = "image" elif text and text.strip(): qvec = encode_text(text.strip()) mode = "text" else: return [], "Upload a product photo or type a description to get recommendations." res = top_matches(qvec, k=3) return render(res), f"Top 3 products most similar to your {mode} query (each result shows its cosine similarity)." def load_plot(name): p = os.path.join("assets", name) return Image.open(p).convert("RGB") if os.path.exists(p) else None EX_DIR = "examples" img_examples = [[os.path.join(EX_DIR, f)] for f in ["cameras.jpg", "furniture.jpg", "animals.jpg", "sporting.jpg", "electronics.jpg"] if os.path.exists(os.path.join(EX_DIR, f))] with gr.Blocks(title="Visual Product Recommender", theme=gr.themes.Soft()) as demo: gr.Markdown( "# πŸ›οΈ Visual Product Recommender\n" "Assignment #3: Embeddings, RecSys, Spaces. A recommendation app on the vision modality, " "built on the `Shopify/product-catalogue` dataset with CLIP embeddings." ) with gr.Tabs(): # ---------------------------------------------------------------- TAB 1 with gr.Tab("πŸ›οΈ Recommender"): gr.Markdown( "Find the 3 most visually similar products in a ~12K-item catalogue. Every product " "image is a CLIP embedding; your query (a photo *or* text) is embedded into the same " "space and ranked by cosine similarity. Uploading a photo is the strongest mode." ) with gr.Row(): with gr.Column(scale=1): img_in = gr.Image(label="β‘  Upload a product photo (best results)", type="pil") if img_examples: gr.Examples(img_examples, inputs=img_in, label="Click an example image") text_in = gr.Textbox(label="β‘‘ …or describe a product", placeholder="e.g. camera lens") gr.Examples( ["camera lens", "helmet", "sofa", "dog leash", "sunglasses", "wooden table"], inputs=text_in, label="Click an example query", ) btn = gr.Button("Recommend 3 products", variant="primary") note = gr.Markdown("") with gr.Column(scale=2): gallery = gr.Gallery(label="Recommendations", columns=3, height=400, object_fit="contain") gr.Markdown( "*Image upload is the reliable mode; text search is best-effort and depends on the item existing in " "the catalogue. An uploaded image takes priority over the text box, so clear it to run a text query.*" ) btn.click(recommend, [text_in, img_in], [gallery, note]) text_in.submit(recommend, [text_in, img_in], [gallery, note]) # ---------------------------------------------------------------- TAB 2 with gr.Tab("πŸ“Š Dataset & Analysis"): gr.Markdown( "## 1. Dataset\n" "Source: [`Shopify/product-catalogue`](https://huggingface.co/datasets/Shopify/product-catalogue), " "loaded directly from Hugging Face with `datasets.load_dataset`.\n\n" "Size: 48,289 products (train 38,631 + test 9,658); I work with the train split and embed a " "balanced ~12K stratified sample (11,912 products) so no category dominates. Median image is " "~900x900 px. Features: title, description, image, category (Google product taxonomy, depth ~4), " "brand (24K, long tail), second-hand flag.\n\n" "## 2. Exploratory Data Analysis\n" "Because this is an image dataset, the EDA covers both the metadata and the images themselves. " "Categories are imbalanced (hence the balanced sampling); brands are a long tail; titles are short " "and often multilingual." ) with gr.Row(): gr.Image(load_plot("eda_categories.png"), label="Top-level product categories", show_label=True) gr.Image(load_plot("eda_brands.png"), label="Top 15 brands (long tail)", show_label=True) with gr.Row(): gr.Image(load_plot("eda_text_lengths.png"), label="Title / description length", show_label=True) gr.Image(load_plot("eda_sample_grid.png"), label="Sample product per category", show_label=True) gr.Markdown( "Image properties. The images are mostly square, bright studio shots, and about 70% have a " "near-white background. This is the key finding: white backgrounds make products from different " "categories look alike, which explains the modest clustering silhouette below and why the recommender " "works best on clean single-product photos." ) with gr.Row(): gr.Image(load_plot("eda_image_dims.png"), label="Image width / height / aspect ratio", show_label=True) gr.Image(load_plot("eda_image_color.png"), label="Brightness and composition (70% white-bg)", show_label=True) gr.Markdown( "## 3. Embeddings and Clustering\n" "Model: [`openai/clip-vit-base-patch32`](https://huggingface.co/openai/clip-vit-base-patch32), " "a small/medium CLIP model that embeds images and text into one shared 512-d space (this is what " "lets the app accept both a photo and a text query). Each image becomes an L2-normalized vector.\n\n" "Clustering: I reduce to 50 PCA components (denoising), run K-Means, and pick K by the " "silhouette score (K=4). The four clusters are interpretable visual product families: " "consumables / tech and hardware / furnishings and soft goods / toys, office and media." ) with gr.Row(): gr.Image(load_plot("silhouette.png"), label="K-Means model selection (silhouette)", show_label=True) gr.Image(load_plot("cluster_category_heatmap.png"), label="Cluster composition by category", show_label=True) with gr.Row(): gr.Image(load_plot("umap_category.png"), label="UMAP colored by category", show_label=True) gr.Image(load_plot("umap_cluster.png"), label="UMAP colored by K-Means cluster", show_label=True) gr.Markdown( "Cluster reasoning: clusters found from purely visual embeddings (no labels) still line up with " "human categories, as the heatmap shows. The silhouette is modest because most products sit on white " "studio backgrounds and overlap visually, but the structure is real, which is what makes " "nearest-neighbour recommendation work.\n\n" "Going deeper. I cross-checked with a second projection (t-SNE) and a second clustering " "algorithm (DBSCAN, eps chosen from a k-distance plot), and looked at the actual products closest " "to each cluster centroid." ) with gr.Row(): gr.Image(load_plot("tsne_category.png"), label="t-SNE (second projection)", show_label=True) gr.Image(load_plot("dbscan.png"), label="DBSCAN (second clustering algorithm)", show_label=True) gr.Image(load_plot("cluster_examples.png"), label="Representative products per cluster", show_label=True) gr.Markdown( "## 4. Recommender and Evaluation\n" "Embeddings are saved to `catalog.parquet`. A query is encoded with the same CLIP model, scored by " "cosine similarity against the catalogue, and the Top-3 are returned (near-duplicates filtered). " "In the notebook I also benchmark a FAISS index (the standard vector-search library) as a scaling " "option: about 50x faster than the brute-force scan with identical results. On 80 held-out products, " "image-to-image retrieval reaches precision@1 β‰ˆ 0.39, about 3x the random baseline. Example queries " "and their Top-3:" ) gr.Image(load_plot("recommend_examples.png"), label="Query (held-out photo) to Top-3", show_label=True) gr.Markdown( "Business & ethics: visual search powers 'shop the look' features, needs no manual tags, and works " "across languages, but it matches *appearance not function* (avoid for safety-critical items), can inherit " "CLIP's web-image biases, and can only recommend products that exist in the catalogue. Full coding work is " "in the notebook (Files tab, `Assignment_3_NoamFuchs.ipynb`)." ) # ---------------------------------------------------------------- TAB 3 with gr.Tab("πŸŽ₯ Presentation"): if os.path.exists("presentation.mp4"): gr.Video("presentation.mp4", label="Presentation walk-through") else: gr.Markdown("The presentation video walk-through will be embedded here.") if __name__ == "__main__": demo.launch()