Brandon Feng committed
Commit 9eb0d4e · 1 Parent(s): be8c689

first ckpt

config.yaml ADDED
@@ -0,0 +1,102 @@
+ pretrained_model_path: stabilityai/stable-video-diffusion-img2vid
+ output_dir: ./output/svd
+ train_data:
+   width: 128
+   height: 128
+   use_bucketing: false
+   return_mask: true
+   return_motion: true
+   sample_start_idx: 1
+   fps: 7
+   frame_step: 1
+   n_sample_frames: 7
+   single_video_path: data/0X1A0A263B22CCD966.avi
+   single_video_prompt: an ultrasound video of heart
+   fallback_prompt: ''
+   path: /data/vision/billf/scratch/tianyuan/dataset/ultrasound/EchoNet-Dynamic/Videos
+   json_path: /webvid/animation1.json
+   image_dir: /vlp/datasets/images/coco
+   image_json: /vlp/datasets/images/coco/coco_karpathy_train.json
+   video_dir: /webvid/webvid/data/videos
+   video_json: /webvid/webvid/data/40K.json
+   single_img_prompt: ''
+ validation_data:
+   prompt: ''
+   prompt_image: data/cond_frame0.jpg
+   sample_preview: true
+   num_frames: 14
+   width: 128
+   height: 128
+   num_inference_steps: 25
+   guidance_scale: 9
+   fps: 7
+   motion_bucket_id: 127
+   decode_chunk_size: 7
+ extra_train_data: []
+ dataset_types:
+ - folder
+ shuffle: true
+ validation_steps: 9000000000.0
+ trainable_modules:
+ - all
+ - attn1
+ - attn2
+ - conv_in
+ - temp_conv
+ - motion
+ extra_unet_params: null
+ extra_text_encoder_params: null
+ train_batch_size: 4
+ max_train_steps: 10000
+ learning_rate: 5.0e-06
+ scale_lr: false
+ lr_scheduler: constant
+ lr_warmup_steps: 0
+ adam_beta1: 0.9
+ adam_beta2: 0.999
+ adam_weight_decay: 0
+ adam_epsilon: 1.0e-08
+ max_grad_norm: 1.0
+ gradient_accumulation_steps: 1
+ gradient_checkpointing: false
+ text_encoder_gradient_checkpointing: false
+ checkpointing_steps: 2500
+ resume_from_checkpoint: null
+ resume_step: null
+ mixed_precision: fp16
+ use_8bit_adam: false
+ enable_xformers_memory_efficient_attention: false
+ enable_torch_2_attn: true
+ seed: 6
+ use_offset_noise: false
+ rescale_schedule: false
+ offset_noise_strength: 0.1
+ extend_dataset: false
+ cache_latents: false
+ cached_latent_dir: null
+ save_pretrained_model: true
+ logger_type: tensorboard
+ motion_mask: false
+ kwargs:
+   motion_strength: false
+ train_text_encoder: false
+ lora_version: cloneofsimo
+ use_unet_lora: false
+ use_text_lora: false
+ lora_unet_dropout: 0.1
+ lora_text_dropout: 0.1
+ save_lora_for_webui: true
+ only_lora_for_webui: false
+ unet_lora_modules:
+ - UNet3DConditionModel
+ text_encoder_lora_modules:
+ - CLIPEncoderLayer
+ lora_rank: 16
+ trainable_text_modules: null
+ dataset:
+   data_path: /data/vision/billf/scratch/tianyuan/dataset/ultrasound/EchoNet-Dynamic
+   deactivate_cache: false
+   fps: 8
+   duration: 2.0
+   grayscale: false
+   image_size: 128
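
This config drives a fine-tune of Stable Video Diffusion on EchoNet-Dynamic ultrasound clips at 128x128, 7 frames per sample. A minimal sketch of consuming it, assuming an OmegaConf-style loader; the training script itself is not part of this commit, so the loader and the CLI-override pattern are illustrative only:

# Hedged sketch: load config.yaml with OmegaConf (an assumption --
# the repo's actual training entry point may parse it differently).
from omegaconf import OmegaConf

config = OmegaConf.load("config.yaml")

print(config.pretrained_model_path)            # stabilityai/stable-video-diffusion-img2vid
print(config.train_data.n_sample_frames)       # 7
print(config.validation_data.num_inference_steps)  # 25

# CLI-style overrides merge on top of the file values,
# e.g. `python train.py train_batch_size=8`:
config = OmegaConf.merge(config, OmegaConf.from_cli())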
image_encoder/config.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "_name_or_path": "/data/vision/billf/scratch/brandonf/.cache/huggingface/hub/models--stabilityai--stable-video-diffusion-img2vid/snapshots/1171a4af7a4e620fb2359c8cb3b312c30c655b93/image_encoder",
+   "architectures": [
+     "CLIPVisionModelWithProjection"
+   ],
+   "attention_dropout": 0.0,
+   "dropout": 0.0,
+   "hidden_act": "gelu",
+   "hidden_size": 1280,
+   "image_size": 224,
+   "initializer_factor": 1.0,
+   "initializer_range": 0.02,
+   "intermediate_size": 5120,
+   "layer_norm_eps": 1e-05,
+   "model_type": "clip_vision_model",
+   "num_attention_heads": 16,
+   "num_channels": 3,
+   "num_hidden_layers": 32,
+   "patch_size": 14,
+   "projection_dim": 1024,
+   "torch_dtype": "float32",
+   "transformers_version": "4.36.0"
+ }
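
Per this config, the image encoder is the CLIP vision tower from the SVD base snapshot: it consumes 224x224 RGB crops and projects them to 1024-dim image embeddings. A hedged sketch of loading it on its own; running from the checkpoint root and the dummy input are assumptions:

# Sketch: load just the image encoder with transformers.
import torch
from transformers import CLIPVisionModelWithProjection

encoder = CLIPVisionModelWithProjection.from_pretrained(
    ".", subfolder="image_encoder"  # assumes cwd is the checkpoint root
)
pixels = torch.randn(1, 3, 224, 224)  # num_channels / image_size from the config
with torch.no_grad():
    out = encoder(pixel_values=pixels)
print(out.image_embeds.shape)  # torch.Size([1, 1024]) -- projection_dim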
image_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ed1e5af7b4042ca30ec29999a4a5cfcac90b7fb610fd05ace834f2dcbb763eab
+ size 2528371296
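
This file is a Git LFS pointer, not the weights themselves: per the LFS spec, "oid" is the SHA-256 of the full blob and "size" is its byte count, and the Hub serves the real ~2.5 GB safetensors file in its place. A small sketch for verifying a downloaded copy against the pointer:

# Sketch: check a downloaded file against the LFS pointer above.
import hashlib, os

def verify_lfs(path, expected_oid, expected_size):
    assert os.path.getsize(path) == expected_size, "size mismatch"
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # 1 MiB chunks
            h.update(chunk)
    assert h.hexdigest() == expected_oid, "sha256 mismatch"

verify_lfs("image_encoder/model.safetensors",
           "ed1e5af7b4042ca30ec29999a4a5cfcac90b7fb610fd05ace834f2dcbb763eab",
           2528371296)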
model_index.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "_class_name": "StableVideoDiffusionPipeline",
+   "_diffusers_version": "0.26.0.dev0",
+   "_name_or_path": "stabilityai/stable-video-diffusion-img2vid",
+   "feature_extractor": [
+     "transformers",
+     "CLIPImageProcessor"
+   ],
+   "image_encoder": [
+     "transformers",
+     "CLIPVisionModelWithProjection"
+   ],
+   "scheduler": [
+     "diffusers",
+     "EulerDiscreteScheduler"
+   ],
+   "unet": [
+     "diffusers",
+     "UNetSpatioTemporalConditionModel"
+   ],
+   "vae": [
+     "diffusers",
+     "AutoencoderKLTemporalDecoder"
+   ]
+ }
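
model_index.json is what lets diffusers reassemble the whole pipeline: each entry maps a subfolder to the library and class that should load it. A hedged sketch of loading this checkpoint and sampling with the validation settings from config.yaml; note this commit shows no feature_extractor folder, so from_pretrained may need that piece supplied from the base repo, and fp16-on-CUDA is an assumption:

# Sketch: assemble the pipeline from this checkpoint's root directory.
import torch
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video

pipe = StableVideoDiffusionPipeline.from_pretrained(
    ".", torch_dtype=torch.float16
).to("cuda")

# Settings mirror validation_data in config.yaml.
image = load_image("data/cond_frame0.jpg")  # conditioning frame
frames = pipe(image, num_frames=14, num_inference_steps=25,
              fps=7, motion_bucket_id=127, decode_chunk_size=7).frames[0]
export_to_video(frames, "preview.mp4", fps=7)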
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,21 @@
+ {
+   "_class_name": "EulerDiscreteScheduler",
+   "_diffusers_version": "0.26.0.dev0",
+   "beta_end": 0.012,
+   "beta_schedule": "scaled_linear",
+   "beta_start": 0.00085,
+   "clip_sample": false,
+   "interpolation_type": "linear",
+   "num_train_timesteps": 1000,
+   "prediction_type": "v_prediction",
+   "rescale_betas_zero_snr": false,
+   "set_alpha_to_one": false,
+   "sigma_max": 700.0,
+   "sigma_min": 0.002,
+   "skip_prk_steps": true,
+   "steps_offset": 1,
+   "timestep_spacing": "leading",
+   "timestep_type": "continuous",
+   "trained_betas": null,
+   "use_karras_sigmas": true
+ }
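
Two settings worth noting: prediction_type is v_prediction, and continuous timesteps with Karras sigmas up to sigma_max 700 match the EDM-style sampling SVD uses. A minimal sketch of rebuilding the scheduler from this config alone (running from the checkpoint root is assumed):

# Sketch: the scheduler needs only its JSON config, no weights.
from diffusers import EulerDiscreteScheduler

scheduler = EulerDiscreteScheduler.from_pretrained(".", subfolder="scheduler")
scheduler.set_timesteps(25)   # num_inference_steps from config.yaml
print(scheduler.sigmas[:3])   # descending noise levels, starting near sigma_max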
unet/config.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "_class_name": "UNetSpatioTemporalConditionModel",
+   "_diffusers_version": "0.26.0.dev0",
+   "_name_or_path": "/data/vision/billf/scratch/brandonf/.cache/huggingface/hub/models--stabilityai--stable-video-diffusion-img2vid/snapshots/1171a4af7a4e620fb2359c8cb3b312c30c655b93/unet",
+   "addition_time_embed_dim": 256,
+   "block_out_channels": [
+     320,
+     640,
+     1280,
+     1280
+   ],
+   "cross_attention_dim": 1024,
+   "down_block_types": [
+     "CrossAttnDownBlockSpatioTemporal",
+     "CrossAttnDownBlockSpatioTemporal",
+     "CrossAttnDownBlockSpatioTemporal",
+     "DownBlockSpatioTemporal"
+   ],
+   "in_channels": 8,
+   "layers_per_block": 2,
+   "num_attention_heads": [
+     5,
+     10,
+     20,
+     20
+   ],
+   "num_frames": 14,
+   "out_channels": 4,
+   "projection_class_embeddings_input_dim": 768,
+   "sample_size": 96,
+   "transformer_layers_per_block": 1,
+   "up_block_types": [
+     "UpBlockSpatioTemporal",
+     "CrossAttnUpBlockSpatioTemporal",
+     "CrossAttnUpBlockSpatioTemporal",
+     "CrossAttnUpBlockSpatioTemporal"
+   ]
+ }
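
in_channels is 8 rather than 4 because the UNet receives the noisy video latents concatenated channel-wise with the VAE-encoded conditioning frame; out_channels stays 4. A hedged shape-check sketch with dummy tensors at a tiny spatial size (the real pipeline prepares all of these inputs itself):

# Sketch: load the spatio-temporal UNet and trace input/output shapes.
import torch
from diffusers import UNetSpatioTemporalConditionModel

unet = UNetSpatioTemporalConditionModel.from_pretrained(".", subfolder="unet")

sample = torch.randn(1, 14, 8, 16, 16)  # (batch, frames, in_channels, h, w)
hidden = torch.randn(1, 1, 1024)        # cross_attention_dim image embeddings
added_time_ids = torch.zeros(1, 3)      # fps, motion_bucket_id, noise_aug_strength
with torch.no_grad():
    out = unet(sample, 1.0, encoder_hidden_states=hidden,
               added_time_ids=added_time_ids).sample
print(out.shape)  # torch.Size([1, 14, 4, 16, 16]) -- out_channels = 4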
unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b6e3771ba35a9d98835fe9bbadf09174797b56b67d84e037ca0a798fc02f01c4
+ size 6098682464
vae/config.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "_class_name": "AutoencoderKLTemporalDecoder",
+   "_diffusers_version": "0.26.0.dev0",
+   "_name_or_path": "/data/vision/billf/scratch/brandonf/.cache/huggingface/hub/models--stabilityai--stable-video-diffusion-img2vid/snapshots/1171a4af7a4e620fb2359c8cb3b312c30c655b93/vae",
+   "block_out_channels": [
+     128,
+     256,
+     512,
+     512
+   ],
+   "down_block_types": [
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D"
+   ],
+   "force_upcast": true,
+   "in_channels": 3,
+   "latent_channels": 4,
+   "layers_per_block": 2,
+   "out_channels": 3,
+   "sample_size": 768,
+   "scaling_factor": 0.18215
+ }
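
This VAE encodes frames independently with plain 2D encoder blocks but decodes with a temporal decoder, which is why decode() takes the frame count. A hedged round-trip sketch at the 128x128 training resolution from config.yaml (checkpoint-root path and dummy frames are assumptions):

# Sketch: per-frame encode, temporally-aware decode.
import torch
from diffusers import AutoencoderKLTemporalDecoder

vae = AutoencoderKLTemporalDecoder.from_pretrained(".", subfolder="vae")

frames = torch.randn(7, 3, 128, 128)  # (frames, rgb, h, w), n_sample_frames = 7
with torch.no_grad():
    latents = vae.encode(frames).latent_dist.sample() * vae.config.scaling_factor
    print(latents.shape)  # torch.Size([7, 4, 16, 16]) -- 8x spatial downsample
    video = vae.decode(latents / vae.config.scaling_factor, num_frames=7).sample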
vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9975042d7bee021bd53a72b1af14c8627d624f6547ec9abe661b68b962b88c49
+ size 391017740