OzzyGT HF Staff committed on
Commit
a0a7816
·
verified ·
1 Parent(s): a3dac04

Upload 6 files

Browse files
audio_vae/config.json CHANGED
@@ -1,19 +1,20 @@
1
  {
2
  "_class_name": "AutoencoderKLLTX2Audio",
3
- "_diffusers_version": "0.37.0.dev0",
4
  "attn_resolutions": null,
5
- "base_channels": 16,
6
  "causality_axis": "height",
7
  "ch_mult": [
8
  1,
9
- 2
 
10
  ],
11
  "double_z": true,
12
  "dropout": 0.0,
13
  "in_channels": 2,
14
  "is_causal": true,
15
- "latent_channels": 4,
16
- "mel_bins": 16,
17
  "mel_hop_length": 160,
18
  "mid_block_add_attention": false,
19
  "norm_type": "pixel",
 
1
  {
2
  "_class_name": "AutoencoderKLLTX2Audio",
3
+ "_diffusers_version": "0.38.0.dev0",
4
  "attn_resolutions": null,
5
+ "base_channels": 128,
6
  "causality_axis": "height",
7
  "ch_mult": [
8
  1,
9
+ 2,
10
+ 4
11
  ],
12
  "double_z": true,
13
  "dropout": 0.0,
14
  "in_channels": 2,
15
  "is_causal": true,
16
+ "latent_channels": 8,
17
+ "mel_bins": 64,
18
  "mel_hop_length": 160,
19
  "mid_block_add_attention": false,
20
  "norm_type": "pixel",
audio_vae/diffusion_pytorch_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c49400e9ca1e90614af15569e829d361b42eaa1f8fc676dc9db4581be6462088
3
- size 321628
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:615ce5acd612c830617d3197d009254bd6c8356e6917aa002a6f96ed01fbbb17
3
+ size 163442552
transformer/config.json CHANGED
@@ -1,16 +1,18 @@
1
  {
2
  "_class_name": "LTX2VideoTransformer3DModel",
3
- "_diffusers_version": "0.37.0.dev0",
4
  "activation_fn": "gelu-approximate",
5
  "attention_bias": true,
6
  "attention_head_dim": 32,
7
  "attention_out_bias": true,
8
  "audio_attention_head_dim": 32,
9
  "audio_cross_attention_dim": 64,
 
 
10
  "audio_hop_length": 160,
11
- "audio_in_channels": 16,
12
  "audio_num_attention_heads": 2,
13
- "audio_out_channels": 16,
14
  "audio_patch_size": 1,
15
  "audio_patch_size_t": 1,
16
  "audio_pos_embed_max_pos": 20,
@@ -21,7 +23,9 @@
21
  "caption_channels": 64,
22
  "causal_offset": 1,
23
  "cross_attention_dim": 64,
 
24
  "cross_attn_timestep_scale_multiplier": 1000,
 
25
  "in_channels": 4,
26
  "norm_elementwise_affine": false,
27
  "norm_eps": 1e-06,
@@ -30,12 +34,14 @@
30
  "out_channels": 4,
31
  "patch_size": 1,
32
  "patch_size_t": 1,
 
33
  "pos_embed_max_pos": 20,
34
  "qk_norm": "rms_norm_across_heads",
35
  "rope_double_precision": true,
36
  "rope_theta": 10000.0,
37
  "rope_type": "interleaved",
38
  "timestep_scale_multiplier": 1000,
 
39
  "vae_scale_factors": [
40
  8,
41
  32,
 
1
  {
2
  "_class_name": "LTX2VideoTransformer3DModel",
3
+ "_diffusers_version": "0.38.0.dev0",
4
  "activation_fn": "gelu-approximate",
5
  "attention_bias": true,
6
  "attention_head_dim": 32,
7
  "attention_out_bias": true,
8
  "audio_attention_head_dim": 32,
9
  "audio_cross_attention_dim": 64,
10
+ "audio_cross_attn_mod": false,
11
+ "audio_gated_attn": false,
12
  "audio_hop_length": 160,
13
+ "audio_in_channels": 128,
14
  "audio_num_attention_heads": 2,
15
+ "audio_out_channels": 128,
16
  "audio_patch_size": 1,
17
  "audio_patch_size_t": 1,
18
  "audio_pos_embed_max_pos": 20,
 
23
  "caption_channels": 64,
24
  "causal_offset": 1,
25
  "cross_attention_dim": 64,
26
+ "cross_attn_mod": false,
27
  "cross_attn_timestep_scale_multiplier": 1000,
28
+ "gated_attn": false,
29
  "in_channels": 4,
30
  "norm_elementwise_affine": false,
31
  "norm_eps": 1e-06,
 
34
  "out_channels": 4,
35
  "patch_size": 1,
36
  "patch_size_t": 1,
37
+ "perturbed_attn": false,
38
  "pos_embed_max_pos": 20,
39
  "qk_norm": "rms_norm_across_heads",
40
  "rope_double_precision": true,
41
  "rope_theta": 10000.0,
42
  "rope_type": "interleaved",
43
  "timestep_scale_multiplier": 1000,
44
+ "use_prompt_embeddings": true,
45
  "vae_scale_factors": [
46
  8,
47
  32,
transformer/diffusion_pytorch_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ac96571eb5a10bf4a6756fe181f450e2efae73ac023f312556445ff83e7cd31
3
- size 1164152
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ba703eed695c8dff6456e91b42ad4497b07b9d036870bb775e8074d907c36f0
3
+ size 2364304
vocoder/config.json CHANGED
@@ -1,8 +1,14 @@
1
  {
2
  "_class_name": "LTX2Vocoder",
3
- "_diffusers_version": "0.37.0.dev0",
 
 
 
 
 
 
4
  "hidden_channels": 32,
5
- "in_channels": 32,
6
  "leaky_relu_negative_slope": 0.1,
7
  "out_channels": 2,
8
  "output_sampling_rate": 24000,
 
1
  {
2
  "_class_name": "LTX2Vocoder",
3
+ "_diffusers_version": "0.38.0.dev0",
4
+ "act_fn": "leaky_relu",
5
+ "antialias": false,
6
+ "antialias_kernel_size": 12,
7
+ "antialias_ratio": 2,
8
+ "final_act_fn": "tanh",
9
+ "final_bias": true,
10
  "hidden_channels": 32,
11
+ "in_channels": 128,
12
  "leaky_relu_negative_slope": 0.1,
13
  "out_channels": 2,
14
  "output_sampling_rate": 24000,
vocoder/diffusion_pytorch_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c337dfbd31cccec2be3ebc22db65ee3058c5f344aa58793bc52e9ed1ae67523
3
- size 29740
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:885a3ef39987df2a9840e963d611ec1ed4172af8532e84c772a8fdbb89335f17
3
+ size 143448