diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..33b49d5a3ffa2d402a301904cc7a2868c5d0a6e2 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,34 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +samples/1758843894746__000000000_0.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758844017427__000000000_1.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758845231301__000000250_0.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758845357677__000000250_1.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758846565751__000000500_0.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758846692407__000000500_1.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758847901716__000000750_0.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758848028069__000000750_1.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758849236074__000001000_0.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758849362296__000001000_1.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758849488430__000001000_2.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758850570560__000001250_0.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758850696962__000001250_1.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758851904671__000001500_0.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758852030870__000001500_1.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758853239252__000001750_0.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758853365704__000001750_1.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758854574518__000002000_0.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758854700459__000002000_1.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758855910203__000002250_0.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758856036364__000002250_1.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758857245953__000002500_0.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758857372128__000002500_1.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758858583467__000002750_0.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758858709776__000002750_1.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758859930728__000003000_0.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758860056799__000003000_1.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758861269184__000003250_0.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758861395260__000003250_1.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758862582214__000003500_0.jpg filter=lfs diff=lfs merge=lfs -text +samples/1758862708072__000003500_1.jpg filter=lfs diff=lfs merge=lfs -text diff --git a/.job_config.json b/.job_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2c94a57d685d80ff1cb04192c8992aa6a8cd8634 --- /dev/null +++ b/.job_config.json @@ -0,0 +1,133 @@ +{ + "job": "extension", + "config": { + "name": "wan_dewa", + "process": [ + { + "type": "diffusion_trainer", + "training_folder": "/app/ai-toolkit/output", + "sqlite_db_path": "/app/ai-toolkit/aitk_db.db", + "device": "cuda", + "trigger_word": null, + "performance_log_every": 10, + "network": { + "type": "lora", + "linear": 16, + "linear_alpha": 16, + "conv": 16, + "conv_alpha": 16, + "lokr_full_rank": true, + "lokr_factor": -1, + "network_kwargs": { + "ignore_if_contains": [] + } + }, + "save": { + "dtype": "bf16", + "save_every": 250, + "max_step_saves_to_keep": 10, + "save_format": "diffusers", + "push_to_hub": false + }, + "datasets": [ + { + "folder_path": "/app/ai-toolkit/datasets/d3w4", + "mask_path": null, + "mask_min_value": 0.1, + "default_caption": "", + "caption_ext": "txt", + "caption_dropout_rate": 0.05, + "cache_latents_to_disk": false, + "is_reg": false, + "network_weight": 1, + "resolution": [ + 512 + ], + "controls": [], + "shrink_video_to_frames": true, + "num_frames": 1, + "do_i2v": true, + "flip_x": false, + "flip_y": false + } + ], + "train": { + "batch_size": 1, + "bypass_guidance_embedding": false, + "steps": 3500, + "gradient_accumulation": 1, + "train_unet": true, + "train_text_encoder": false, + "gradient_checkpointing": true, + "noise_scheduler": "flowmatch", + "optimizer": "adamw8bit", + "timestep_type": "sigmoid", + "content_or_style": "balanced", + "optimizer_params": { + "weight_decay": 0.0001 + }, + "unload_text_encoder": false, + "cache_text_embeddings": true, + "lr": 0.0002, + "ema_config": { + "use_ema": false, + "ema_decay": 0.99 + }, + "skip_first_sample": false, + "force_first_sample": false, + "disable_sampling": false, + "dtype": "bf16", + "diff_output_preservation": false, + "diff_output_preservation_multiplier": 1, + "diff_output_preservation_class": "person", + "switch_boundary_every": 10, + "loss_type": "mse" + }, + "model": { + "name_or_path": "ai-toolkit/Wan2.2-T2V-A14B-Diffusers-bf16", + "quantize": true, + "qtype": "uint4|ostris/accuracy_recovery_adapters/wan22_14b_t2i_torchao_uint4.safetensors", + "quantize_te": true, + "qtype_te": "qfloat8", + "arch": "wan22_14b:t2v", + "low_vram": true, + "model_kwargs": { + "train_high_noise": true, + "train_low_noise": true + } + }, + "sample": { + "sampler": "flowmatch", + "sample_every": 250, + "width": 1024, + "height": 1024, + "samples": [ + { + "prompt": "A man named D3W4 , playing chess at the park, bomb going off in the background" + }, + { + "prompt": "A man named D3W4 holding a coffee cup, in a beanie, sitting at a cafe" + }, + { + "prompt": "A man named D3W4 playing the guitar, on stage, singing a song, laser lights, punk rocker" + }, + { + "prompt": "photo of a man named D3W4, white background, medium shot, modeling clothing, studio lighting, white backdrop" + } + ], + "neg": "", + "seed": 42, + "walk_seed": true, + "guidance_scale": 4, + "sample_steps": 25, + "num_frames": 1, + "fps": 1 + } + } + ] + }, + "meta": { + "name": "[name]", + "version": "1.0" + } +} \ No newline at end of file diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..758f53e3c8d5a94bf2e5dc14e2fa4100968d910d --- /dev/null +++ b/config.yaml @@ -0,0 +1,107 @@ +job: extension +config: + name: wan_dewa + process: + - type: diffusion_trainer + training_folder: /app/ai-toolkit/output + sqlite_db_path: /app/ai-toolkit/aitk_db.db + device: cuda + trigger_word: null + performance_log_every: 10 + network: + type: lora + linear: 16 + linear_alpha: 16 + conv: 16 + conv_alpha: 16 + lokr_full_rank: true + lokr_factor: -1 + network_kwargs: + ignore_if_contains: [] + save: + dtype: bf16 + save_every: 250 + max_step_saves_to_keep: 10 + save_format: diffusers + push_to_hub: false + datasets: + - folder_path: /app/ai-toolkit/datasets/d3w4 + mask_path: null + mask_min_value: 0.1 + default_caption: '' + caption_ext: txt + caption_dropout_rate: 0.05 + cache_latents_to_disk: false + is_reg: false + network_weight: 1 + resolution: + - 512 + controls: [] + shrink_video_to_frames: true + num_frames: 1 + do_i2v: true + flip_x: false + flip_y: false + train: + batch_size: 1 + bypass_guidance_embedding: false + steps: 3500 + gradient_accumulation: 1 + train_unet: true + train_text_encoder: false + gradient_checkpointing: true + noise_scheduler: flowmatch + optimizer: adamw8bit + timestep_type: sigmoid + content_or_style: balanced + optimizer_params: + weight_decay: 0.0001 + unload_text_encoder: false + cache_text_embeddings: true + lr: 0.0002 + ema_config: + use_ema: false + ema_decay: 0.99 + skip_first_sample: false + force_first_sample: false + disable_sampling: false + dtype: bf16 + diff_output_preservation: false + diff_output_preservation_multiplier: 1 + diff_output_preservation_class: person + switch_boundary_every: 10 + loss_type: mse + model: + name_or_path: ai-toolkit/Wan2.2-T2V-A14B-Diffusers-bf16 + quantize: true + qtype: uint4|ostris/accuracy_recovery_adapters/wan22_14b_t2i_torchao_uint4.safetensors + quantize_te: true + qtype_te: qfloat8 + arch: wan22_14b:t2v + low_vram: true + model_kwargs: + train_high_noise: true + train_low_noise: true + sample: + sampler: flowmatch + sample_every: 250 + width: 1024 + height: 1024 + samples: + - prompt: A man named D3W4 , playing chess at the park, bomb going off in the + background + - prompt: A man named D3W4 holding a coffee cup, in a beanie, sitting at a cafe + - prompt: A man named D3W4 playing the guitar, on stage, singing a song, laser + lights, punk rocker + - prompt: photo of a man named D3W4, white background, medium shot, modeling + clothing, studio lighting, white backdrop + neg: '' + seed: 42 + walk_seed: true + guidance_scale: 4 + sample_steps: 25 + num_frames: 1 + fps: 1 +meta: + name: wan_dewa + version: '1.0' diff --git a/log.txt b/log.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd08add64e354572b3087028b989a65ee97fc595 --- /dev/null +++ b/log.txt @@ -0,0 +1,6692 @@ +Running 1 job +{ + "type": "diffusion_trainer", + "training_folder": "/app/ai-toolkit/output", + "sqlite_db_path": "/app/ai-toolkit/aitk_db.db", + "device": "cuda", + "trigger_word": null, + "performance_log_every": 10, + "network": { + "type": "lora", + "linear": 16, + "linear_alpha": 16, + "conv": 16, + "conv_alpha": 16, + "lokr_full_rank": true, + "lokr_factor": -1, + "network_kwargs": { + "ignore_if_contains": [] + } + }, + "save": { + "dtype": "bf16", + "save_every": 250, + "max_step_saves_to_keep": 10, + "save_format": "diffusers", + "push_to_hub": false + }, + "datasets": [ + { + "folder_path": "/app/ai-toolkit/datasets/d3w4", + "mask_path": null, + "mask_min_value": 0.1, + "default_caption": "", + "caption_ext": "txt", + "caption_dropout_rate": 0.05, + "cache_latents_to_disk": false, + "is_reg": false, + "network_weight": 1, + "resolution": [ + 512 + ], + "controls": [], + "shrink_video_to_frames": true, + "num_frames": 1, + "do_i2v": true, + "flip_x": false, + "flip_y": false + } + ], + "train": { + "batch_size": 1, + "bypass_guidance_embedding": false, + "steps": 3500, + "gradient_accumulation": 1, + "train_unet": true, + "train_text_encoder": false, + "gradient_checkpointing": true, + "noise_scheduler": "flowmatch", + "optimizer": "adamw8bit", + "timestep_type": "sigmoid", + "content_or_style": "balanced", + "optimizer_params": { + "weight_decay": 0.0001 + }, + "unload_text_encoder": false, + "cache_text_embeddings": true, + "lr": 0.0002, + "ema_config": { + "use_ema": false, + "ema_decay": 0.99 + }, + "skip_first_sample": false, + "force_first_sample": false, + "disable_sampling": false, + "dtype": "bf16", + "diff_output_preservation": false, + "diff_output_preservation_multiplier": 1, + "diff_output_preservation_class": "person", + "switch_boundary_every": 10, + "loss_type": "mse" + }, + "model": { + "name_or_path": "ai-toolkit/Wan2.2-T2V-A14B-Diffusers-bf16", + "quantize": true, + "qtype": "uint4|ostris/accuracy_recovery_adapters/wan22_14b_t2i_torchao_uint4.safetensors", + "quantize_te": true, + "qtype_te": "qfloat8", + "arch": "wan22_14b:t2v", + "low_vram": true, + "model_kwargs": { + "train_high_noise": true, + "train_low_noise": true + } + }, + "sample": { + "sampler": "flowmatch", + "sample_every": 250, + "width": 1024, + "height": 1024, + "samples": [ + { + "prompt": "A man named D3W4 , playing chess at the park, bomb going off in the background" + }, + { + "prompt": "A man named D3W4 holding a coffee cup, in a beanie, sitting at a cafe" + }, + { + "prompt": "A man named D3W4 playing the guitar, on stage, singing a song, laser lights, punk rocker" + }, + { + "prompt": "photo of a man named D3W4, white background, medium shot, modeling clothing, studio lighting, white backdrop" + } + ], + "neg": "", + "seed": 42, + "walk_seed": true, + "guidance_scale": 4, + "sample_steps": 25, + "num_frames": 1, + "fps": 1 + } +} +Using SQLite database at /app/ai-toolkit/aitk_db.db +Job ID: "bbfb709b-6853-4f38-a51e-c2b7afbfb429" + +############################################# +# Running job: wan_dewa +############################################# + + +Running 1 process +Loading Wan model +Loading transformer 1 + config.json: 0%| | 0.00/550 [00:00