Upload 6 files
Browse files
audio_vae/config.json
CHANGED
|
@@ -1,19 +1,20 @@
|
|
| 1 |
{
|
| 2 |
"_class_name": "AutoencoderKLLTX2Audio",
|
| 3 |
-
"_diffusers_version": "0.
|
| 4 |
"attn_resolutions": null,
|
| 5 |
-
"base_channels":
|
| 6 |
"causality_axis": "height",
|
| 7 |
"ch_mult": [
|
| 8 |
1,
|
| 9 |
-
2
|
|
|
|
| 10 |
],
|
| 11 |
"double_z": true,
|
| 12 |
"dropout": 0.0,
|
| 13 |
"in_channels": 2,
|
| 14 |
"is_causal": true,
|
| 15 |
-
"latent_channels":
|
| 16 |
-
"mel_bins":
|
| 17 |
"mel_hop_length": 160,
|
| 18 |
"mid_block_add_attention": false,
|
| 19 |
"norm_type": "pixel",
|
|
|
|
| 1 |
{
|
| 2 |
"_class_name": "AutoencoderKLLTX2Audio",
|
| 3 |
+
"_diffusers_version": "0.38.0.dev0",
|
| 4 |
"attn_resolutions": null,
|
| 5 |
+
"base_channels": 128,
|
| 6 |
"causality_axis": "height",
|
| 7 |
"ch_mult": [
|
| 8 |
1,
|
| 9 |
+
2,
|
| 10 |
+
4
|
| 11 |
],
|
| 12 |
"double_z": true,
|
| 13 |
"dropout": 0.0,
|
| 14 |
"in_channels": 2,
|
| 15 |
"is_causal": true,
|
| 16 |
+
"latent_channels": 8,
|
| 17 |
+
"mel_bins": 64,
|
| 18 |
"mel_hop_length": 160,
|
| 19 |
"mid_block_add_attention": false,
|
| 20 |
"norm_type": "pixel",
|
audio_vae/diffusion_pytorch_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:615ce5acd612c830617d3197d009254bd6c8356e6917aa002a6f96ed01fbbb17
|
| 3 |
+
size 163442552
|
transformer/config.json
CHANGED
|
@@ -1,16 +1,18 @@
|
|
| 1 |
{
|
| 2 |
"_class_name": "LTX2VideoTransformer3DModel",
|
| 3 |
-
"_diffusers_version": "0.
|
| 4 |
"activation_fn": "gelu-approximate",
|
| 5 |
"attention_bias": true,
|
| 6 |
"attention_head_dim": 32,
|
| 7 |
"attention_out_bias": true,
|
| 8 |
"audio_attention_head_dim": 32,
|
| 9 |
"audio_cross_attention_dim": 64,
|
|
|
|
|
|
|
| 10 |
"audio_hop_length": 160,
|
| 11 |
-
"audio_in_channels":
|
| 12 |
"audio_num_attention_heads": 2,
|
| 13 |
-
"audio_out_channels":
|
| 14 |
"audio_patch_size": 1,
|
| 15 |
"audio_patch_size_t": 1,
|
| 16 |
"audio_pos_embed_max_pos": 20,
|
|
@@ -21,7 +23,9 @@
|
|
| 21 |
"caption_channels": 64,
|
| 22 |
"causal_offset": 1,
|
| 23 |
"cross_attention_dim": 64,
|
|
|
|
| 24 |
"cross_attn_timestep_scale_multiplier": 1000,
|
|
|
|
| 25 |
"in_channels": 4,
|
| 26 |
"norm_elementwise_affine": false,
|
| 27 |
"norm_eps": 1e-06,
|
|
@@ -30,12 +34,14 @@
|
|
| 30 |
"out_channels": 4,
|
| 31 |
"patch_size": 1,
|
| 32 |
"patch_size_t": 1,
|
|
|
|
| 33 |
"pos_embed_max_pos": 20,
|
| 34 |
"qk_norm": "rms_norm_across_heads",
|
| 35 |
"rope_double_precision": true,
|
| 36 |
"rope_theta": 10000.0,
|
| 37 |
"rope_type": "interleaved",
|
| 38 |
"timestep_scale_multiplier": 1000,
|
|
|
|
| 39 |
"vae_scale_factors": [
|
| 40 |
8,
|
| 41 |
32,
|
|
|
|
| 1 |
{
|
| 2 |
"_class_name": "LTX2VideoTransformer3DModel",
|
| 3 |
+
"_diffusers_version": "0.38.0.dev0",
|
| 4 |
"activation_fn": "gelu-approximate",
|
| 5 |
"attention_bias": true,
|
| 6 |
"attention_head_dim": 32,
|
| 7 |
"attention_out_bias": true,
|
| 8 |
"audio_attention_head_dim": 32,
|
| 9 |
"audio_cross_attention_dim": 64,
|
| 10 |
+
"audio_cross_attn_mod": false,
|
| 11 |
+
"audio_gated_attn": false,
|
| 12 |
"audio_hop_length": 160,
|
| 13 |
+
"audio_in_channels": 128,
|
| 14 |
"audio_num_attention_heads": 2,
|
| 15 |
+
"audio_out_channels": 128,
|
| 16 |
"audio_patch_size": 1,
|
| 17 |
"audio_patch_size_t": 1,
|
| 18 |
"audio_pos_embed_max_pos": 20,
|
|
|
|
| 23 |
"caption_channels": 64,
|
| 24 |
"causal_offset": 1,
|
| 25 |
"cross_attention_dim": 64,
|
| 26 |
+
"cross_attn_mod": false,
|
| 27 |
"cross_attn_timestep_scale_multiplier": 1000,
|
| 28 |
+
"gated_attn": false,
|
| 29 |
"in_channels": 4,
|
| 30 |
"norm_elementwise_affine": false,
|
| 31 |
"norm_eps": 1e-06,
|
|
|
|
| 34 |
"out_channels": 4,
|
| 35 |
"patch_size": 1,
|
| 36 |
"patch_size_t": 1,
|
| 37 |
+
"perturbed_attn": false,
|
| 38 |
"pos_embed_max_pos": 20,
|
| 39 |
"qk_norm": "rms_norm_across_heads",
|
| 40 |
"rope_double_precision": true,
|
| 41 |
"rope_theta": 10000.0,
|
| 42 |
"rope_type": "interleaved",
|
| 43 |
"timestep_scale_multiplier": 1000,
|
| 44 |
+
"use_prompt_embeddings": true,
|
| 45 |
"vae_scale_factors": [
|
| 46 |
8,
|
| 47 |
32,
|
transformer/diffusion_pytorch_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0ba703eed695c8dff6456e91b42ad4497b07b9d036870bb775e8074d907c36f0
|
| 3 |
+
size 2364304
|
vocoder/config.json
CHANGED
|
@@ -1,8 +1,14 @@
|
|
| 1 |
{
|
| 2 |
"_class_name": "LTX2Vocoder",
|
| 3 |
-
"_diffusers_version": "0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
"hidden_channels": 32,
|
| 5 |
-
"in_channels":
|
| 6 |
"leaky_relu_negative_slope": 0.1,
|
| 7 |
"out_channels": 2,
|
| 8 |
"output_sampling_rate": 24000,
|
|
|
|
| 1 |
{
|
| 2 |
"_class_name": "LTX2Vocoder",
|
| 3 |
+
"_diffusers_version": "0.38.0.dev0",
|
| 4 |
+
"act_fn": "leaky_relu",
|
| 5 |
+
"antialias": false,
|
| 6 |
+
"antialias_kernel_size": 12,
|
| 7 |
+
"antialias_ratio": 2,
|
| 8 |
+
"final_act_fn": "tanh",
|
| 9 |
+
"final_bias": true,
|
| 10 |
"hidden_channels": 32,
|
| 11 |
+
"in_channels": 128,
|
| 12 |
"leaky_relu_negative_slope": 0.1,
|
| 13 |
"out_channels": 2,
|
| 14 |
"output_sampling_rate": 24000,
|
vocoder/diffusion_pytorch_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:885a3ef39987df2a9840e963d611ec1ed4172af8532e84c772a8fdbb89335f17
|
| 3 |
+
size 143448
|