"""
SmolLM2-360M-Instruct Architecture Analysis
For 8bit-threshold-computer LLM Integration Project

Loads the model, prints its configuration, parameter inventory, tokenization
of a small arithmetic prompt and per-layer hidden-state statistics, then
writes a JSON summary for report generation.
"""

import json
import re
from collections import defaultdict
from pathlib import Path

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig

# Default location of the JSON summary (kept for backward compatibility).
DEFAULT_OUTPUT_PATH = "D:/8bit-threshold-computer/llm_integration/smollm2_analysis.json"

_RULE = "=" * 80


def _print_banner(title, leading_newline=True):
    """Print *title* framed by two 80-character rules."""
    print("\n" + _RULE if leading_newline else _RULE)
    print(title)
    print(_RULE)


def _parameter_group(name):
    """Map a parameter name to the report group it is listed under."""
    if "embed_tokens" in name:
        return "Embedding"
    if "lm_head" in name:
        return "LM Head"
    if "layers" not in name:
        return "Final Norm" if "norm" in name else "Other"

    # The layer index is taken from the third dot-separated component.
    layer_num = name.split(".")[2]
    if "self_attn" in name:
        component = "Attention"
    elif "mlp" in name:
        component = "MLP"
    elif "norm" in name:
        component = "Norms"
    else:
        component = "Other"
    return f"Layer {layer_num} - {component}"


def _natural_sort_key(label):
    """Sort key that orders embedded numbers numerically ("Layer 2" < "Layer 10")."""
    # re.split with a capturing group alternates text / digits / text / ...,
    # so odd positions are always the digit runs.
    pieces = re.split(r"(\d+)", label)
    return [int(piece) if index % 2 else piece for index, piece in enumerate(pieces)]


def analyze_smollm2(output_path=DEFAULT_OUTPUT_PATH):
    """Analyze SmolLM2-360M-Instruct and save a JSON summary.

    Prints the configuration, a grouped parameter inventory, the tokenization
    of the prompt ``"47 + 86"`` and statistics of every hidden state produced
    by one forward pass.

    Args:
        output_path: File the JSON summary is written to. Missing parent
            directories are created. Defaults to ``DEFAULT_OUTPUT_PATH``.

    Returns:
        A tuple ``(summary, model, tokenizer, hidden_states, param_groups)``
        where ``summary`` is the dict that was written to ``output_path`` and
        ``param_groups`` maps group labels to lists of parameter descriptions.
    """
    model_name = "HuggingFaceTB/SmolLM2-360M-Instruct"

    _print_banner("SmolLM2-360M-Instruct Architecture Analysis", leading_newline=False)

    # Load config first
    print("\n[1] Loading model configuration...")
    config = AutoConfig.from_pretrained(model_name)
    print(f"Config loaded: {type(config).__name__}")

    # Load tokenizer
    print("\n[2] Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print(f"Tokenizer loaded: {type(tokenizer).__name__}")

    # Load model with hidden states output
    print("\n[3] Loading model with output_hidden_states=True...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        output_hidden_states=True,
        torch_dtype=torch.float32,
    )
    model.eval()
    print(f"Model loaded: {type(model).__name__}")

    # ========================================================================
    # ARCHITECTURE CENSUS
    # ========================================================================
    _print_banner("ARCHITECTURE CENSUS")

    print("\n--- Model Configuration ---")
    for key, value in sorted(config.to_dict().items()):
        print(f" {key}: {value}")

    # Values that are both printed and stored in the summary.
    num_kv_heads = getattr(config, "num_key_value_heads", config.num_attention_heads)
    head_dim = config.hidden_size // config.num_attention_heads

    print("\n--- Key Architecture Parameters ---")
    print(f" Model type: {config.model_type}")
    print(f" Vocabulary size: {config.vocab_size}")
    print(f" Hidden size: {config.hidden_size}")
    print(f" Intermediate size: {config.intermediate_size}")
    print(f" Number of hidden layers: {config.num_hidden_layers}")
    print(f" Number of attention heads: {config.num_attention_heads}")
    print(f" Number of KV heads: {num_kv_heads}")
    print(f" Head dimension: {head_dim}")
    print(f" Max position embeddings: {config.max_position_embeddings}")
    print(f" RMS norm epsilon: {getattr(config, 'rms_norm_eps', 'N/A')}")
    print(f" Rope theta: {getattr(config, 'rope_theta', 'N/A')}")
    print(f" Tie word embeddings: {getattr(config, 'tie_word_embeddings', 'N/A')}")

    # ========================================================================
    # WEIGHT INVENTORY
    # ========================================================================
    _print_banner("WEIGHT INVENTORY")

    total_params = 0
    param_groups = defaultdict(list)
    for name, param in model.named_parameters():
        count = param.numel()
        total_params += count
        param_groups[_parameter_group(name)].append({
            "name": name,
            "shape": tuple(param.shape),
            "numel": count,
            "dtype": str(param.dtype),
        })

    print(f"\n--- Total Parameters: {total_params:,} ---")
    print(f" ({total_params / 1e6:.2f}M parameters)")

    # Print by group; natural ordering keeps layers in numeric order.
    for group_name in sorted(param_groups, key=_natural_sort_key):
        params = param_groups[group_name]
        group_total = sum(p["numel"] for p in params)
        print(f"\n### {group_name} ({group_total:,} params, {group_total/total_params*100:.2f}%)")
        for p in params:
            print(f" {p['name']}")
            print(f" Shape: {p['shape']}, Elements: {p['numel']:,}, Dtype: {p['dtype']}")

    # ========================================================================
    # TOKENIZATION ANALYSIS
    # ========================================================================
    _print_banner("TOKENIZATION ANALYSIS")

    test_input = "47 + 86"
    print(f"\n--- Test Input: '{test_input}' ---")

    tokens = tokenizer(test_input, return_tensors="pt")
    input_ids = tokens["input_ids"][0]
    # Plain Python ints: used for printing, decoding and the JSON summary.
    token_ids = input_ids.tolist()
    token_strings = [tokenizer.decode([token_id]) for token_id in token_ids]

    print(f"\nInput IDs: {token_ids}")
    print(f"Number of tokens: {len(token_ids)}")

    print("\nToken breakdown:")
    for position, (token_id, token_str) in enumerate(zip(token_ids, token_strings)):
        print(f" Position {position}: ID={token_id:5d}, Token='{token_str}'")

    # Additional tokenization tests
    print("\n--- Additional Tokenization Tests ---")
    test_cases = ["0", "1", "47", "86", "133", " + ", "="]
    for tc in test_cases:
        ids = tokenizer.encode(tc, add_special_tokens=False)
        decoded = [tokenizer.decode([single_id]) for single_id in ids]
        print(f" '{tc}' -> IDs: {ids}, Tokens: {decoded}")

    # ========================================================================
    # HIDDEN STATE ANALYSIS
    # ========================================================================
    _print_banner("HIDDEN STATE ANALYSIS")

    print(f"\n--- Running inference on '{test_input}' ---")
    with torch.no_grad():
        outputs = model(**tokens)
    hidden_states = outputs.hidden_states

    print(f"\nNumber of hidden state outputs: {len(hidden_states)}")
    print("(This includes embedding output + each layer's output)")

    # Index 0 is reported as the embedding output, index i as layer i-1.
    layer_names = [
        "Embedding" if i == 0 else f"Layer {i-1}" for i in range(len(hidden_states))
    ]

    print("\nHidden state shapes at each layer:")
    for i, (layer_name, hs) in enumerate(zip(layer_names, hidden_states)):
        print(f" {layer_name}: {tuple(hs.shape)}")
        if i == 0:
            print(f" (batch_size=1, seq_len={hs.shape[1]}, hidden_dim={hs.shape[2]})")

    # Analyze hidden state statistics at different layers
    print("\n--- Hidden State Statistics (per layer) ---")
    for layer_name, hs in zip(layer_names, hidden_states):
        # reshape (unlike view) also accepts non-contiguous tensors.
        hs_flat = hs.reshape(-1)
        print(f" {layer_name}:")
        print(f" Mean: {hs_flat.mean().item():.6f}")
        print(f" Std: {hs_flat.std().item():.6f}")
        print(f" Min: {hs_flat.min().item():.6f}")
        print(f" Max: {hs_flat.max().item():.6f}")

    # ========================================================================
    # MODEL STRUCTURE DEEP DIVE
    # ========================================================================
    _print_banner("MODEL STRUCTURE DEEP DIVE")

    print("\n--- Model Architecture String ---")
    print(model)

    # ========================================================================
    # SUMMARY DATA FOR REPORT
    # ========================================================================
    summary = {
        "model_name": model_name,
        "total_params": total_params,
        "config": {
            "vocab_size": config.vocab_size,
            "hidden_size": config.hidden_size,
            "intermediate_size": config.intermediate_size,
            "num_hidden_layers": config.num_hidden_layers,
            "num_attention_heads": config.num_attention_heads,
            "num_kv_heads": num_kv_heads,
            "head_dim": head_dim,
            "max_position_embeddings": config.max_position_embeddings,
            "rms_norm_eps": getattr(config, "rms_norm_eps", None),
            "rope_theta": getattr(config, "rope_theta", None),
            "tie_word_embeddings": getattr(config, "tie_word_embeddings", None),
        },
        "tokenization": {
            "test_input": test_input,
            "token_ids": token_ids,
            "num_tokens": len(token_ids),
            "tokens": token_strings,
        },
        "hidden_states": {
            "num_outputs": len(hidden_states),
            "shape": list(hidden_states[0].shape),
        },
        "param_groups": {
            group: {"count": len(entries), "total": sum(p["numel"] for p in entries)}
            for group, entries in param_groups.items()
        },
    }

    # Save summary as JSON for report generation. Create the target directory
    # first so a missing folder does not discard the finished analysis.
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    _print_banner(f"Analysis complete. Summary saved to {output_path}")

    return summary, model, tokenizer, hidden_states, param_groups


if __name__ == "__main__":
    summary, model, tokenizer, hidden_states, param_groups = analyze_smollm2()