Brandon Feng committed on
Commit ·
9eb0d4e
1
Parent(s): be8c689
first ckpt
Browse files
- config.yaml +102 -0
- image_encoder/config.json +23 -0
- image_encoder/model.safetensors +3 -0
- model_index.json +25 -0
- scheduler/scheduler_config.json +21 -0
- unet/config.json +38 -0
- unet/diffusion_pytorch_model.safetensors +3 -0
- vae/config.json +24 -0
- vae/diffusion_pytorch_model.safetensors +3 -0
config.yaml
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pretrained_model_path: stabilityai/stable-video-diffusion-img2vid
|
| 2 |
+
output_dir: ./output/svd
|
| 3 |
+
train_data:
|
| 4 |
+
width: 128
|
| 5 |
+
height: 128
|
| 6 |
+
use_bucketing: false
|
| 7 |
+
return_mask: true
|
| 8 |
+
return_motion: true
|
| 9 |
+
sample_start_idx: 1
|
| 10 |
+
fps: 7
|
| 11 |
+
frame_step: 1
|
| 12 |
+
n_sample_frames: 7
|
| 13 |
+
single_video_path: data/0X1A0A263B22CCD966.avi
|
| 14 |
+
single_video_prompt: an ultrasound video of heart
|
| 15 |
+
fallback_prompt: ''
|
| 16 |
+
path: /data/vision/billf/scratch/tianyuan/dataset/ultrasound/EchoNet-Dynamic/Videos
|
| 17 |
+
json_path: /webvid/animation1.json
|
| 18 |
+
image_dir: /vlp/datasets/images/coco
|
| 19 |
+
image_json: /vlp/datasets/images/coco/coco_karpathy_train.json
|
| 20 |
+
video_dir: /webvid/webvid/data/videos
|
| 21 |
+
video_json: /webvid/webvid/data/40K.json
|
| 22 |
+
single_img_prompt: ''
|
| 23 |
+
validation_data:
|
| 24 |
+
prompt: ''
|
| 25 |
+
prompt_image: data/cond_frame0.jpg
|
| 26 |
+
sample_preview: true
|
| 27 |
+
num_frames: 14
|
| 28 |
+
width: 128
|
| 29 |
+
height: 128
|
| 30 |
+
num_inference_steps: 25
|
| 31 |
+
guidance_scale: 9
|
| 32 |
+
fps: 7
|
| 33 |
+
motion_bucket_id: 127
|
| 34 |
+
decode_chunk_size: 7
|
| 35 |
+
extra_train_data: []
|
| 36 |
+
dataset_types:
|
| 37 |
+
- folder
|
| 38 |
+
shuffle: true
|
| 39 |
+
validation_steps: 9000000000.0
|
| 40 |
+
trainable_modules:
|
| 41 |
+
- all
|
| 42 |
+
- attn1
|
| 43 |
+
- attn2
|
| 44 |
+
- conv_in
|
| 45 |
+
- temp_conv
|
| 46 |
+
- motion
|
| 47 |
+
extra_unet_params: null
|
| 48 |
+
extra_text_encoder_params: null
|
| 49 |
+
train_batch_size: 4
|
| 50 |
+
max_train_steps: 10000
|
| 51 |
+
learning_rate: 5.0e-06
|
| 52 |
+
scale_lr: false
|
| 53 |
+
lr_scheduler: constant
|
| 54 |
+
lr_warmup_steps: 0
|
| 55 |
+
adam_beta1: 0.9
|
| 56 |
+
adam_beta2: 0.999
|
| 57 |
+
adam_weight_decay: 0
|
| 58 |
+
adam_epsilon: 1.0e-08
|
| 59 |
+
max_grad_norm: 1.0
|
| 60 |
+
gradient_accumulation_steps: 1
|
| 61 |
+
gradient_checkpointing: false
|
| 62 |
+
text_encoder_gradient_checkpointing: false
|
| 63 |
+
checkpointing_steps: 2500
|
| 64 |
+
resume_from_checkpoint: null
|
| 65 |
+
resume_step: null
|
| 66 |
+
mixed_precision: fp16
|
| 67 |
+
use_8bit_adam: false
|
| 68 |
+
enable_xformers_memory_efficient_attention: false
|
| 69 |
+
enable_torch_2_attn: true
|
| 70 |
+
seed: 6
|
| 71 |
+
use_offset_noise: false
|
| 72 |
+
rescale_schedule: false
|
| 73 |
+
offset_noise_strength: 0.1
|
| 74 |
+
extend_dataset: false
|
| 75 |
+
cache_latents: false
|
| 76 |
+
cached_latent_dir: null
|
| 77 |
+
save_pretrained_model: true
|
| 78 |
+
logger_type: tensorboard
|
| 79 |
+
motion_mask: false
|
| 80 |
+
kwargs:
|
| 81 |
+
motion_strength: false
|
| 82 |
+
train_text_encoder: false
|
| 83 |
+
lora_version: cloneofsimo
|
| 84 |
+
use_unet_lora: false
|
| 85 |
+
use_text_lora: false
|
| 86 |
+
lora_unet_dropout: 0.1
|
| 87 |
+
lora_text_dropout: 0.1
|
| 88 |
+
save_lora_for_webui: true
|
| 89 |
+
only_lora_for_webui: false
|
| 90 |
+
unet_lora_modules:
|
| 91 |
+
- UNet3DConditionModel
|
| 92 |
+
text_encoder_lora_modules:
|
| 93 |
+
- CLIPEncoderLayer
|
| 94 |
+
lora_rank: 16
|
| 95 |
+
trainable_text_modules: null
|
| 96 |
+
dataset:
|
| 97 |
+
data_path: /data/vision/billf/scratch/tianyuan/dataset/ultrasound/EchoNet-Dynamic
|
| 98 |
+
deactivate_cache: false
|
| 99 |
+
fps: 8
|
| 100 |
+
duration: 2.0
|
| 101 |
+
grayscale: false
|
| 102 |
+
image_size: 128
|
image_encoder/config.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "/data/vision/billf/scratch/brandonf/.cache/huggingface/hub/models--stabilityai--stable-video-diffusion-img2vid/snapshots/1171a4af7a4e620fb2359c8cb3b312c30c655b93/image_encoder",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"CLIPVisionModelWithProjection"
|
| 5 |
+
],
|
| 6 |
+
"attention_dropout": 0.0,
|
| 7 |
+
"dropout": 0.0,
|
| 8 |
+
"hidden_act": "gelu",
|
| 9 |
+
"hidden_size": 1280,
|
| 10 |
+
"image_size": 224,
|
| 11 |
+
"initializer_factor": 1.0,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": 5120,
|
| 14 |
+
"layer_norm_eps": 1e-05,
|
| 15 |
+
"model_type": "clip_vision_model",
|
| 16 |
+
"num_attention_heads": 16,
|
| 17 |
+
"num_channels": 3,
|
| 18 |
+
"num_hidden_layers": 32,
|
| 19 |
+
"patch_size": 14,
|
| 20 |
+
"projection_dim": 1024,
|
| 21 |
+
"torch_dtype": "float32",
|
| 22 |
+
"transformers_version": "4.36.0"
|
| 23 |
+
}
|
image_encoder/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ed1e5af7b4042ca30ec29999a4a5cfcac90b7fb610fd05ace834f2dcbb763eab
|
| 3 |
+
size 2528371296
|
model_index.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "StableVideoDiffusionPipeline",
|
| 3 |
+
"_diffusers_version": "0.26.0.dev0",
|
| 4 |
+
"_name_or_path": "stabilityai/stable-video-diffusion-img2vid",
|
| 5 |
+
"feature_extractor": [
|
| 6 |
+
"transformers",
|
| 7 |
+
"CLIPImageProcessor"
|
| 8 |
+
],
|
| 9 |
+
"image_encoder": [
|
| 10 |
+
"transformers",
|
| 11 |
+
"CLIPVisionModelWithProjection"
|
| 12 |
+
],
|
| 13 |
+
"scheduler": [
|
| 14 |
+
"diffusers",
|
| 15 |
+
"EulerDiscreteScheduler"
|
| 16 |
+
],
|
| 17 |
+
"unet": [
|
| 18 |
+
"diffusers",
|
| 19 |
+
"UNetSpatioTemporalConditionModel"
|
| 20 |
+
],
|
| 21 |
+
"vae": [
|
| 22 |
+
"diffusers",
|
| 23 |
+
"AutoencoderKLTemporalDecoder"
|
| 24 |
+
]
|
| 25 |
+
}
|
scheduler/scheduler_config.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "EulerDiscreteScheduler",
|
| 3 |
+
"_diffusers_version": "0.26.0.dev0",
|
| 4 |
+
"beta_end": 0.012,
|
| 5 |
+
"beta_schedule": "scaled_linear",
|
| 6 |
+
"beta_start": 0.00085,
|
| 7 |
+
"clip_sample": false,
|
| 8 |
+
"interpolation_type": "linear",
|
| 9 |
+
"num_train_timesteps": 1000,
|
| 10 |
+
"prediction_type": "v_prediction",
|
| 11 |
+
"rescale_betas_zero_snr": false,
|
| 12 |
+
"set_alpha_to_one": false,
|
| 13 |
+
"sigma_max": 700.0,
|
| 14 |
+
"sigma_min": 0.002,
|
| 15 |
+
"skip_prk_steps": true,
|
| 16 |
+
"steps_offset": 1,
|
| 17 |
+
"timestep_spacing": "leading",
|
| 18 |
+
"timestep_type": "continuous",
|
| 19 |
+
"trained_betas": null,
|
| 20 |
+
"use_karras_sigmas": true
|
| 21 |
+
}
|
unet/config.json
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "UNetSpatioTemporalConditionModel",
|
| 3 |
+
"_diffusers_version": "0.26.0.dev0",
|
| 4 |
+
"_name_or_path": "/data/vision/billf/scratch/brandonf/.cache/huggingface/hub/models--stabilityai--stable-video-diffusion-img2vid/snapshots/1171a4af7a4e620fb2359c8cb3b312c30c655b93/unet",
|
| 5 |
+
"addition_time_embed_dim": 256,
|
| 6 |
+
"block_out_channels": [
|
| 7 |
+
320,
|
| 8 |
+
640,
|
| 9 |
+
1280,
|
| 10 |
+
1280
|
| 11 |
+
],
|
| 12 |
+
"cross_attention_dim": 1024,
|
| 13 |
+
"down_block_types": [
|
| 14 |
+
"CrossAttnDownBlockSpatioTemporal",
|
| 15 |
+
"CrossAttnDownBlockSpatioTemporal",
|
| 16 |
+
"CrossAttnDownBlockSpatioTemporal",
|
| 17 |
+
"DownBlockSpatioTemporal"
|
| 18 |
+
],
|
| 19 |
+
"in_channels": 8,
|
| 20 |
+
"layers_per_block": 2,
|
| 21 |
+
"num_attention_heads": [
|
| 22 |
+
5,
|
| 23 |
+
10,
|
| 24 |
+
20,
|
| 25 |
+
20
|
| 26 |
+
],
|
| 27 |
+
"num_frames": 14,
|
| 28 |
+
"out_channels": 4,
|
| 29 |
+
"projection_class_embeddings_input_dim": 768,
|
| 30 |
+
"sample_size": 96,
|
| 31 |
+
"transformer_layers_per_block": 1,
|
| 32 |
+
"up_block_types": [
|
| 33 |
+
"UpBlockSpatioTemporal",
|
| 34 |
+
"CrossAttnUpBlockSpatioTemporal",
|
| 35 |
+
"CrossAttnUpBlockSpatioTemporal",
|
| 36 |
+
"CrossAttnUpBlockSpatioTemporal"
|
| 37 |
+
]
|
| 38 |
+
}
|
unet/diffusion_pytorch_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b6e3771ba35a9d98835fe9bbadf09174797b56b67d84e037ca0a798fc02f01c4
|
| 3 |
+
size 6098682464
|
vae/config.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "AutoencoderKLTemporalDecoder",
|
| 3 |
+
"_diffusers_version": "0.26.0.dev0",
|
| 4 |
+
"_name_or_path": "/data/vision/billf/scratch/brandonf/.cache/huggingface/hub/models--stabilityai--stable-video-diffusion-img2vid/snapshots/1171a4af7a4e620fb2359c8cb3b312c30c655b93/vae",
|
| 5 |
+
"block_out_channels": [
|
| 6 |
+
128,
|
| 7 |
+
256,
|
| 8 |
+
512,
|
| 9 |
+
512
|
| 10 |
+
],
|
| 11 |
+
"down_block_types": [
|
| 12 |
+
"DownEncoderBlock2D",
|
| 13 |
+
"DownEncoderBlock2D",
|
| 14 |
+
"DownEncoderBlock2D",
|
| 15 |
+
"DownEncoderBlock2D"
|
| 16 |
+
],
|
| 17 |
+
"force_upcast": true,
|
| 18 |
+
"in_channels": 3,
|
| 19 |
+
"latent_channels": 4,
|
| 20 |
+
"layers_per_block": 2,
|
| 21 |
+
"out_channels": 3,
|
| 22 |
+
"sample_size": 768,
|
| 23 |
+
"scaling_factor": 0.18215
|
| 24 |
+
}
|
vae/diffusion_pytorch_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9975042d7bee021bd53a72b1af14c8627d624f6547ec9abe661b68b962b88c49
|
| 3 |
+
size 391017740
|