NZUONG committed on
Commit
55e451a
·
verified ·
1 Parent(s): 39ed077

Upload 15 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ samples/nam-calm.wav filter=lfs diff=lfs merge=lfs -text
37
+ samples/nam-cham.wav filter=lfs diff=lfs merge=lfs -text
38
+ samples/nam-nhanh.wav filter=lfs diff=lfs merge=lfs -text
39
+ samples/nam-truyen-cam.wav filter=lfs diff=lfs merge=lfs -text
40
+ samples/nu-calm.wav filter=lfs diff=lfs merge=lfs -text
41
+ samples/nu-cham.wav filter=lfs diff=lfs merge=lfs -text
42
+ samples/nu-luu-loat.wav filter=lfs diff=lfs merge=lfs -text
43
+ samples/nu-nhan-nha.wav filter=lfs diff=lfs merge=lfs -text
44
+ samples/nu-nhe-nhang.wav filter=lfs diff=lfs merge=lfs -text
45
+ user_sample.wav filter=lfs diff=lfs merge=lfs -text
46
+ vi_sample.wav filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,49 @@
1
  ---
2
- license: apache-2.0
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ license: other
3
+ license_name: coqui-public-model-license
4
+ license_link: https://coqui.ai/cpml
5
+ pipeline_tag: text-to-speech
6
+ datasets:
7
+ - capleaf/viVoice
8
+ language:
9
+ - vi
10
  ---
11
+
12
+ # viⓍTTS
13
+
14
+ viⓍTTS là mô hình tạo sinh giọng nói cho phép bạn sao chép giọng nói sang các ngôn ngữ khác nhau chỉ bằng cách sử dụng một đoạn âm thanh nhanh dài 6 giây. Mô hình này được tiếp tục đào tạo từ mô hình [XTTS-v2.0.3](https://huggingface.co/coqui/XTTS-v2) bằng cách mở rộng tokenizer sang tiếng Việt và huấn luyện trên tập dữ liệu [viVoice](https://huggingface.co/datasets/thinhlpg/viVoice).
15
+
16
+ viⓍTTS is a voice generation model that lets you clone voices into different languages by using just a quick 6-second audio clip. This model is fine-tuned from the [XTTS-v2.0.3](https://huggingface.co/coqui/XTTS-v2) model by expanding the tokenizer to Vietnamese and fine-tuning on the [viVoice](https://huggingface.co/datasets/thinhlpg/viVoice) dataset.
17
+
18
+ ### Languages
19
+
20
+ viXTTS supports 18 languages: English (en), Spanish (es), French (fr), German (de), Italian (it), Portuguese (pt),
21
+ Polish (pl), Turkish (tr), Russian (ru), Dutch (nl), Czech (cs), Arabic (ar), Chinese (zh-cn), Japanese (ja), Hungarian (hu), Korean (ko),
22
+ Hindi (hi), **Vietnamese (vi)**.
23
+
24
+ ### Known Limitations
25
+
26
+ - Incompatibility with the [original TTS library](https://github.com/coqui-ai/TTS) (a pull request will be made later).
27
+ - Subpar performance for Vietnamese input sentences under 10 words (yielding inconsistent output and odd trailing sounds).
28
+ - This model was fine-tuned only on Vietnamese data. Its effectiveness with languages other than Vietnamese hasn't been tested, so output quality in those languages may be reduced.
29
+
30
+ ### Demo
31
+
32
+ Please check out [this repo](https://github.com/thinhlpg/vixtts-demo)
33
+
34
+ ### Usage
35
+
36
+ For quick usage, please check out [this notebook](https://colab.research.google.com/drive/1q9vA7mDyvK_u0ijDDNuycDoUUbryM3p3?usp=sharing)
37
+
38
+ ### License
39
+
40
+ This model is licensed under the [Coqui Public Model License](https://coqui.ai/cpml).
41
+
42
+ ### Contact
43
+
44
+ Fine-tuned by Thinh Le at FPT University HCMC, as a component of [Non La](https://huggingface.co/capleaf)'s graduation thesis.
45
+ Contact:
46
+
47
+ - You can message me directly on Facebook: <https://fb.com/thinhlpg/> (preferred 🤗)
48
+ - GitHub: <https://github.com/thinhlpg>
49
+ - Email: <thinhlpg@gmail.com> or <thinhlpgse161384@fpt.edu.vn>
config.json ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "output",
3
+ "logger_uri": null,
4
+ "run_name": "run",
5
+ "project_name": null,
6
+ "run_description": "viXTTS training",
7
+ "print_step": null,
8
+ "plot_step": null,
9
+ "model_param_stats": false,
10
+ "wandb_entity": null,
11
+ "dashboard_logger": "tensorboard",
12
+ "save_on_interrupt": true,
13
+ "log_model_step": null,
14
+ "save_step": 24000,
15
+ "save_n_checkpoints": 2,
16
+ "save_checkpoints": true,
17
+ "save_all_best": false,
18
+ "save_best_after": 0,
19
+ "target_loss": null,
20
+ "print_eval": true,
21
+ "test_delay_epochs": 0,
22
+ "run_eval": true,
23
+ "run_eval_steps": null,
24
+ "distributed_backend": "nccl",
25
+ "distributed_url": "tcp://localhost:54321",
26
+ "mixed_precision": false,
27
+ "precision": "fp16",
28
+ "epochs": 5,
29
+ "batch_size": 2,
30
+ "eval_batch_size": 2,
31
+ "grad_clip": 0.0,
32
+ "scheduler_after_epoch": true,
33
+ "lr": 5e-06,
34
+ "optimizer": "AdamW",
35
+ "optimizer_params": {
36
+ "betas": [
37
+ 0.9,
38
+ 0.96
39
+ ],
40
+ "eps": 1e-08,
41
+ "weight_decay": 0.01
42
+ },
43
+ "lr_scheduler": "MultiStepLR",
44
+ "lr_scheduler_params": {
45
+ "milestones": [
46
+ 900000,
47
+ 2700000,
48
+ 5400000
49
+ ],
50
+ "gamma": 0.5,
51
+ "last_epoch": -1
52
+ },
53
+ "use_grad_scaler": false,
54
+ "allow_tf32": false,
55
+ "cudnn_enable": true,
56
+ "cudnn_deterministic": false,
57
+ "cudnn_benchmark": false,
58
+ "training_seed": 1,
59
+ "model": "xtts",
60
+ "num_loader_workers": 0,
61
+ "num_eval_loader_workers": 0,
62
+ "use_noise_augment": false,
63
+ "audio": {
64
+ "sample_rate": 22050,
65
+ "output_sample_rate": 24000,
66
+ "dvae_sample_rate": 22050
67
+ },
68
+ "use_phonemes": false,
69
+ "phonemizer": null,
70
+ "phoneme_language": null,
71
+ "compute_input_seq_cache": false,
72
+ "text_cleaner": null,
73
+ "enable_eos_bos_chars": false,
74
+ "test_sentences_file": "",
75
+ "phoneme_cache_path": null,
76
+ "characters": null,
77
+ "add_blank": false,
78
+ "batch_group_size": 48,
79
+ "loss_masking": null,
80
+ "min_audio_len": 1,
81
+ "max_audio_len": Infinity,
82
+ "min_text_len": 1,
83
+ "max_text_len": Infinity,
84
+ "compute_f0": false,
85
+ "compute_energy": false,
86
+ "compute_linear_spec": false,
87
+ "precompute_num_workers": 0,
88
+ "start_by_longest": false,
89
+ "shuffle": false,
90
+ "drop_last": false,
91
+ "datasets": [
92
+ {
93
+ "formatter": "",
94
+ "dataset_name": "",
95
+ "path": "",
96
+ "meta_file_train": "",
97
+ "ignored_speakers": null,
98
+ "language": "",
99
+ "phonemizer": "",
100
+ "meta_file_val": "",
101
+ "meta_file_attn_mask": ""
102
+ }
103
+ ],
104
+ "test_sentences": [],
105
+ "eval_split_max_size": null,
106
+ "eval_split_size": 0.01,
107
+ "use_speaker_weighted_sampler": false,
108
+ "speaker_weighted_sampler_alpha": 1.0,
109
+ "use_language_weighted_sampler": false,
110
+ "language_weighted_sampler_alpha": 1.0,
111
+ "use_length_weighted_sampler": false,
112
+ "length_weighted_sampler_alpha": 1.0,
113
+ "model_args": {
114
+ "gpt_batch_size": 1,
115
+ "enable_redaction": false,
116
+ "kv_cache": true,
117
+ "gpt_checkpoint": null,
118
+ "clvp_checkpoint": null,
119
+ "decoder_checkpoint": null,
120
+ "num_chars": 255,
121
+ "tokenizer_file": "",
122
+ "gpt_max_audio_tokens": 605,
123
+ "gpt_max_text_tokens": 402,
124
+ "gpt_max_prompt_tokens": 70,
125
+ "gpt_layers": 30,
126
+ "gpt_n_model_channels": 1024,
127
+ "gpt_n_heads": 16,
128
+ "gpt_number_text_tokens": 7544,
129
+ "gpt_start_text_token": null,
130
+ "gpt_stop_text_token": null,
131
+ "gpt_num_audio_tokens": 1026,
132
+ "gpt_start_audio_token": 1024,
133
+ "gpt_stop_audio_token": 1025,
134
+ "gpt_code_stride_len": 1024,
135
+ "gpt_use_masking_gt_prompt_approach": true,
136
+ "gpt_use_perceiver_resampler": true,
137
+ "input_sample_rate": 22050,
138
+ "output_sample_rate": 24000,
139
+ "output_hop_length": 256,
140
+ "decoder_input_dim": 1024,
141
+ "d_vector_dim": 512,
142
+ "cond_d_vector_in_each_upsampling_layer": true,
143
+ "duration_const": 102400
144
+ },
145
+ "model_dir": null,
146
+ "languages": [
147
+ "en",
148
+ "es",
149
+ "fr",
150
+ "de",
151
+ "it",
152
+ "pt",
153
+ "pl",
154
+ "tr",
155
+ "ru",
156
+ "nl",
157
+ "cs",
158
+ "ar",
159
+ "zh-cn",
160
+ "hu",
161
+ "ko",
162
+ "ja",
163
+ "hi",
164
+ "vi"
165
+ ],
166
+ "temperature": 0.85,
167
+ "length_penalty": 1.0,
168
+ "repetition_penalty": 2.0,
169
+ "top_k": 50,
170
+ "top_p": 0.85,
171
+ "num_gpt_outputs": 1,
172
+ "gpt_cond_len": 12,
173
+ "gpt_cond_chunk_len": 4,
174
+ "max_ref_len": 10,
175
+ "sound_norm_refs": false
176
+ }
model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:534670e4b752002b7d7224e6ea1f467bd608c8dd3c36efaa45e1f4696e8bd1d2
3
+ size 1875343894
samples/nam-calm.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:274758c6b991f29e53261aa98bb9a7d5aad37c8a8776968845982e750681e00c
3
+ size 744014
samples/nam-cham.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:586be2752c4de191f094cd786a49ae7f14d7d8b8d5c9df8ea0ed8e385fd9f8ec
3
+ size 783950
samples/nam-nhanh.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f6dbf341ed3309f79a718a9beea59d4f3a5004ba4b6675dbb6382f41abf7ed5
3
+ size 646222
samples/nam-truyen-cam.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f9e430ec2e6d78a8f282126923f19ae45c12c937c77a5b023dc501954fc4fa4
3
+ size 875598
samples/nu-calm.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a023eb447f851673cc9a9f61ca9b99263724d31d057004eb37b84925c9b5bc6
3
+ size 759374
samples/nu-cham.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acee1c4cc033c27fd78a0bd4f7041d78db59abf1b075ce3e5d4141bb35ef1508
3
+ size 933454
samples/nu-luu-loat.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a120871b168489a33b7f3188764b0f973583bf5284bd96cd805d9e6256a7e45
3
+ size 710734
samples/nu-nhan-nha.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c039491a0d5e44be9bd24222277d39dc3db1cecb6e6f9270fcc99a997837dd0
3
+ size 763982
samples/nu-nhe-nhang.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f6a0e132a37aa6d28a610eed3daa151309c4a98453d1da1d94d9e88c8438f8c
3
+ size 793166
user_sample.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6d8d594013b1fdd3c6b8c1d095735510899cd0d56bd2003f8367b6aced2e094
3
+ size 5670316
vi_sample.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f6a0e132a37aa6d28a610eed3daa151309c4a98453d1da1d94d9e88c8438f8c
3
+ size 793166
vocab.json ADDED
The diff for this file is too large to render. See raw diff