julse commited on
Commit
a0727ad
·
verified ·
1 Parent(s): 34da6e2

Update model/codon_attr.py

Browse files
Files changed (1) hide show
  1. model/codon_attr.py +582 -371
model/codon_attr.py CHANGED
@@ -1,384 +1,595 @@
1
- #!/usr/bin/env python
2
- # -*- coding: utf-8 -*-
3
- """
4
- Title : drawRNA.py
5
- project : web
6
- Created by: julse
7
- Created on: 2025/7/4 14:24
8
- des: TODO
9
- """
10
-
11
- import sys
12
  import os
13
- import time
14
 
15
  import pandas as pd
16
  import numpy as np
17
-
18
- import gradio as gr
19
- import pandas as pd
20
- import numpy as np
21
- import os
22
- import tempfile
23
- import subprocess
24
- from PIL import Image
25
-
26
- # 定义颜色样式 - 固定使用一组预定义颜色
27
- COLORS = [
28
- '#FF0000', # 红色 - UTR5
29
- '#0000FF', # 蓝色 - CDS起始区
30
- '#FFC0CB', # 粉色 - CDS终止区
31
- '#FFA500', # 橙色 - UTR3
32
- '#FFFF00', # 黄色 - 起始密码子
33
- '#800080' # 紫色 - 终止密码子
34
- ]
35
-
36
- COLOR_MAP = {
37
- 'UTR5': '#FF0000', # 红色
38
- 'CDS_start': '#0000FF', # 蓝色 - CDS起始区
39
- 'CDS_mid': '#00FF00', # 绿色 - CDS中间区(添加的)
40
- 'CDS_end': '#FFC0CB', # 粉色 - CDS终止区
41
- 'UTR3': '#FFA500', # 橙色
42
- 'start_codon': '#FFFF00', # 黄色 - 起始密码子
43
- 'stop_codon': '#800080', # 紫色 - 终止密码子
44
- 'intron': '#A9A9A9', # 灰色 - 内含子(添加的)
45
- 'exon': '#90EE90', # 浅绿色 - 外显子(添加的)
46
- }
47
-
48
- def get_bases_index(utr5, cds, utr3):
49
- """计算各区域的位置索引"""
50
- start_codon_idx = len(utr5)
51
- stop_codon_idx = len(utr5) + len(cds)
52
-
53
- # UTR5区域
54
- utr5_start = max(0, start_codon_idx - 300)
55
- utr5_range = list(range(utr5_start + 1, start_codon_idx + 1))
56
-
57
- # CDS起始区(不包括起始密码子)
58
- cds_start = start_codon_idx + 3
59
- cds_end = min(start_codon_idx + 300, stop_codon_idx - 3)
60
- start_codon_range = list(range(cds_start + 1, cds_end + 1))
61
-
62
- # CDS终止区(不包括终止密码子)
63
- cds_start = max(start_codon_idx, stop_codon_idx - 300)
64
- stop_codon_range = list(range(cds_start + 1, stop_codon_idx - 2))
65
-
66
- # UTR3区域
67
- utr3_range = list(range(stop_codon_idx + 1, min(stop_codon_idx + 301, stop_codon_idx + len(utr3) + 1)))
68
-
69
- # 起始密码子 (3个碱基)
70
- start_codon = list(range(start_codon_idx + 1, start_codon_idx + 4))
71
-
72
- # 终止密码子 (3个碱基)
73
- stop_codon = list(range(stop_codon_idx - 2, stop_codon_idx + 1))
74
-
75
- # 转换为逗号分隔的字符串
76
- return (
77
- ",".join(map(str, utr5_range)),
78
- ",".join(map(str, start_codon_range)),
79
- ",".join(map(str, stop_codon_range)),
80
- ",".join(map(str, utr3_range)),
81
- ",".join(map(str, start_codon)),
82
- ",".join(map(str, stop_codon))
83
- )
84
-
85
-
86
- def calc_mfe(seq):
87
- import RNA
88
-
89
- fc = RNA.fold_compound(seq)
90
- ss, mfe = fc.mfe()
91
- return ss, mfe
92
-
93
-
94
- def dbn_to_tuple(dbn, c1_region=[], c2_region=[]):
95
- # 构建配对字典
96
- stack, pairs = [], {}
97
- for i, char in enumerate(dbn):
98
- if char == '(':
99
- stack.append(i)
100
- elif char == ')':
101
- j = stack.pop()
102
- if len(c1_region) == 0 or len(c2_region) == 0:
103
- pairs[i + 1] = j + 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  else:
105
- if i + 1 in c2_region and j + 1 in c1_region:
106
- pairs[i + 1] = j + 1
107
- return pairs
108
-
109
-
110
- def run_cmd(command, output_file):
111
- # 执行命令
112
- result = subprocess.run(command, capture_output=True, text=True)
113
- # 检查是否执行成功
114
- if result.returncode != 0:
115
- error_msg = f"执行VARNA命令时出错:\n{result.stderr}"
116
- os.unlink(output_file) # 删除临时文件
117
- raise RuntimeError(error_msg)
118
-
119
- # 检查文件是否成功创建
120
- if not os.path.exists(output_file):
121
- raise FileNotFoundError("未能生成结构图文件")
122
-
123
-
124
-
125
-
126
- def run_draw_rna_advanced(full_sequence, structure, utr5_range, start_codon_range,
127
- stop_codon_range, utr3_range, start_codon, stop_codon,
128
- focus_region, auxBPs, output_file,algorithm, title=''):
129
- import matplotlib.pyplot as plt
130
- from draw_rna.ipynb_draw import draw_struct
131
-
132
- # 解析输入
133
- utr5_range = eval(utr5_range)
134
- start_codon_range = eval(start_codon_range)
135
- stop_codon_range = eval(stop_codon_range)
136
- utr3_range = eval(utr3_range)
137
- start_codon = eval(start_codon)
138
- stop_codon = eval(stop_codon)
139
-
140
- # 定义颜色方
141
-
142
- # 颜色映射
143
- COLOR_MAP = {
144
- 'UTR5': '#FF0000', # 红色
145
- 'CDS_start': '#0000FF', # 蓝色 - CDS起始区
146
- 'CDS_end': '#FFC0CB', # 粉色 - CDS终止区
147
- 'UTR3': '#FFA500', # 橙色
148
- 'start_codon': '#FFFF00', # 黄色 - 起始密码子
149
- 'stop_codon': '#800080', # 紫色 - 终止密码子
150
- 'default': '#808080' # 灰色
151
- }
152
-
153
-
154
- # 区域到数值的映射
155
- region_to_value = {
156
- 'default':0,
157
- 'UTR5': 1,
158
- 'CDS_start': 2,
159
- 'CDS_end': 3,
160
- 'UTR3': 4,
161
- 'start_codon': 5,
162
- 'stop_codon': 6
163
- }
164
-
165
- # 自定义colormap
166
- from matplotlib.colors import ListedColormap
167
-
168
- # 创建自定义颜色列表,按照数值顺序
169
- custom_colors = [
170
- COLOR_MAP['default'], # 0: 灰色
171
- COLOR_MAP['UTR5'], # 1: 红色
172
- COLOR_MAP['CDS_start'], # 2: 蓝色
173
- COLOR_MAP['CDS_end'], # 3: 粉色
174
- COLOR_MAP['UTR3'], # 4: 橙色
175
- COLOR_MAP['start_codon'], # 5: 黄色
176
- COLOR_MAP['stop_codon'] # 6: 紫色
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  ]
178
 
179
- custom_cmap = ListedColormap(custom_colors)
180
- # 创建数值数组,每个数值对应一种颜色
181
-
182
- colors = [region_to_value['default']]*len(full_sequence)
183
- for i in utr5_range:
184
- colors[i-1]= region_to_value['UTR5']
185
- for i in utr3_range:
186
- colors[i-1] = region_to_value['UTR3']
187
- for i in start_codon_range:
188
- colors[i-1] = region_to_value['CDS_start']
189
- for i in stop_codon_range:
190
- colors[i-1] = region_to_value['CDS_end']
191
- for i in start_codon:
192
- colors[i-1] = region_to_value['start_codon']
193
- for i in stop_codon:
194
- colors[i-1] = region_to_value['stop_codon']
195
-
196
- draw_struct(full_sequence, structure,
197
- c = colors,
198
- cmap = custom_cmap,
199
- vmin = 0,
200
- vmax = 6,
201
- line=algorithm,
202
- )
203
- # 添加图例
204
- color_scheme = COLOR_MAP
205
- legend_elements = [
206
- plt.Rectangle((0, 0), 1, 1, facecolor=color_scheme['UTR5'], edgecolor='black', label="5'UTR"),
207
- plt.Rectangle((0, 0), 1, 1, facecolor=color_scheme['CDS_start'], edgecolor='black', label="CDS Start"),
208
- plt.Rectangle((0, 0), 1, 1, facecolor=color_scheme['CDS_end'], edgecolor='black', label="CDS End"),
209
- plt.Rectangle((0, 0), 1, 1, facecolor=color_scheme['UTR3'], edgecolor='black', label="3'UTR"),
210
- plt.Rectangle((0, 0), 1, 1, facecolor=color_scheme['start_codon'], edgecolor='black', label="Start Codon"),
211
- plt.Rectangle((0, 0), 1, 1, facecolor=color_scheme['stop_codon'], edgecolor='black', label="Stop Codon"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  ]
213
 
214
- plt.legend(handles=legend_elements, loc='upper left', bbox_to_anchor=(1.05, 1), fontsize=10)
215
-
216
- # 调整布局并保存
217
- plt.savefig(output_file, dpi=300, bbox_inches='tight')
218
- plt.close()
219
-
220
- print(f"Successfully created: {output_file}")
221
-
222
- def draw_simple(utr5_seq, title=''):
223
- img_paths = []
224
- stru5, mfe = calc_mfe(utr5_seq)
225
- import matplotlib.pyplot as plt
226
- from draw_rna.ipynb_draw import draw_struct
227
- draw_struct(utr5_seq, stru5)
228
-
229
- # 创建临时文件
230
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmpfile:
231
- output_file = tmpfile.name
232
- img_paths.append(output_file)
233
- plt.title(title)
234
- # 保存当前活动的图形
235
- plt.savefig(output_file,
236
- dpi=300,
237
- bbox_inches='tight',
238
- facecolor='white',
239
- edgecolor='none')
240
- return img_paths, mfe, stru5
241
-
242
-
243
-
244
-
245
- def generate_rna_structure(utr5_seq, cds_seq, utr3_seq, structure, draw_2d=["mRNA"]):
246
- """生成RNA结构图"""
247
- message = ""
248
- # 组合完整序列
249
- full_sequence = utr5_seq + cds_seq + utr3_seq
250
- mfe = None
251
- img_paths = []
252
-
253
- if "Full mRNA" in draw_2d:
254
- if structure == "":
255
- structure, mfe = calc_mfe(full_sequence)
256
- # 验证序列和结构长度匹配
257
- if len(full_sequence) != len(structure):
258
- return f"序列长度({len(full_sequence)})与结构长度({len(structure)})不匹配"
259
- '''full mRNA'''
260
- # 获取各区域位置
261
- utr5_range, start_codon_range, stop_codon_range, utr3_range, start_codon, stop_codon = get_bases_index(
262
- utr5_seq, cds_seq, utr3_seq
263
- )
264
- focus_region = f'{min(eval(utr5_range))}-{max(eval(start_codon_range))}:fill=#bcffdd;{min(eval(stop_codon_range))}-{max(eval(utr3_range))}:fill=#bcffdd'
265
- pairs = dbn_to_tuple(structure, c1_region=eval(','.join([utr5_range, start_codon, start_codon_range])),
266
- c2_region=eval(','.join([stop_codon_range, utr3_range, stop_codon])))
267
- auxBPs = ';'.join([f'({key},{value}):color=#6ed86e' for key, value in pairs.items()])
268
- for algorithm in ["line", "naview"]:
269
- # 创建临时文件
270
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmpfile:
271
- output_file = tmpfile.name
272
- img_paths.append((output_file,f'mRNA_{algorithm}'))
273
- # algorithm = "line" # 线条算法
274
- # 构建VARNA命令, write to file
275
- # run_VARNA(full_sequence, structure, utr5_range, start_codon_range, stop_codon_range, utr3_range,
276
- # start_codon, stop_codon, focus_region, auxBPs, output_file, algorithm, title='mRNA')
277
- #
278
-
279
-
280
- algorithm = algorithm=="line" # 线条算法
281
- # 构建VARNA命令, write to file
282
- run_draw_rna_advanced(full_sequence, structure, utr5_range, start_codon_range, stop_codon_range, utr3_range,
283
- start_codon, stop_codon, focus_region, auxBPs, output_file,algorithm, title='mRNA')
284
-
285
- if "5'leader (30 nt)" in draw_2d:
286
- img_path, local_mfe, stru5 = draw_simple(full_sequence[:30], title="5'leader (30 nt))")
287
- img_paths.extend((img_path,'head_30'))
288
- message += f"\nhead(30nt) MFE={local_mfe:.2f} kcal/mol"
289
- if "5'UTR" in draw_2d:
290
- img_path, local_mfe, stru5 = draw_simple(utr5_seq, title="5'UTR")
291
- img_paths.extend((img_path,'utr5'))
292
- message += f"\n5'UTR MFE={local_mfe:.2f} kcal/mol"
293
- if "CDS" in draw_2d:
294
- img_path, local_mfe, stru5 = draw_simple(cds_seq, title="CDS")
295
- img_paths.extend((img_path,'cds'))
296
- message += f"\nCDS MFE={local_mfe:.2f} kcal/mol"
297
- if "3'UTR" in draw_2d:
298
- img_path, local_mfe, stru5 = draw_simple(utr3_seq, title="3'UTR")
299
- img_paths.extend((img_path,'utr3'))
300
- message += f"\n3'UTR MFE={local_mfe:.2f} kcal/mol"
301
-
302
- return img_paths, mfe, structure, message
303
-
304
-
305
- def visualize_rna(utr5_seq, cds_seq, utr3_seq, structure):
306
- """可视化RNA结构的主函数"""
307
- # 生成RNA结构图
308
- image_path, mfe, structure, message = generate_rna_structure(utr5_seq, cds_seq, utr3_seq, structure)
309
- mfe = f'MFE={mfe:.2f} kcal/mol' if mfe else None
310
-
311
- # 返回图像
312
- return image_path, mfe, structure, message
313
-
314
-
315
- def draw_rna_2d():
316
- # 创建Gradio界面
317
- with gr.Blocks(title="RNA结构可视化") as demo:
318
- gr.Markdown("# RNA结构可视化工具")
319
- gr.Markdown("使用VARNA可视化RNA二级结构,并高亮显示不同区域")
320
-
321
- with gr.Row():
322
- with gr.Column(scale=1):
323
- utr5_seq = gr.Textbox(label="5'UTR序列", value="AUGCCAUGAACAGCUAC", placeholder="输入5'UTR序列...")
324
- cds_seq = gr.Textbox(label="CDS序列", value="AUGCCAUGAACAGCUAC", placeholder="输入CDS序列...")
325
- utr3_seq = gr.Textbox(label="3'UTR序列", value="AUGCCAUGAACAGCUAC", placeholder="输入3'UTR序列...")
326
- structure = gr.Textbox(
327
- label="二级结构",
328
- value="...........((((.((((.((((........)))).))))...))))..",
329
- placeholder="输入点括号表示的二级结构..."
330
- )
331
- submit_btn = gr.Button("生成结构图", variant="primary")
332
-
333
- with gr.Column():
334
- # output_image = gr.Image(label="RNA结构图", interactive=False)
335
- output_image = gr.Gallery(label="RNA结构图", interactive=False, object_fit="contain")
336
- mfe = gr.Markdown(label="MFE", value="")
337
- message = gr.Markdown(label="Message", value="")
338
-
339
- # 颜色图例
340
- with gr.Accordion("颜色说明", open=False):
341
- gr.Markdown("""
342
- | 颜色 | 区域 |
343
- |------|------|
344
- | <span style="color:red">■</span> 红色 | 5'UTR 区域 |
345
- | <span style="color:blue">■</span> 蓝色 | CDS起始区域 |
346
- | <span style="color:#FFC0CB">■</span> 粉色 | CDS终止区域 |
347
- | <span style="color:orange">■</span> 橙色 | 3'UTR 区域 |
348
- | <span style="color:yellow">■</span> 黄色 | 起始密码子 (AUG) |
349
- | <span style="color:purple">■</span> 紫色 | 终止密码子 (UAA, UAG, UGA) |
350
- """)
351
-
352
- # 示例数据
353
- with gr.Accordion("示例数据", open=False):
354
- gr.Examples(
355
- examples=[
356
- [
357
- "AUGCCAUGAACAGCUAC",
358
- "AUGCCAUGAACAGCUAC",
359
- "AUGCCAUGAACAGCUAC",
360
- "...........((((.((((.((((........)))).))))...)))).."
361
- ],
362
- [
363
- "GGGAAAUUUCCC",
364
- "AUGCCAUGAACAGCUAC",
365
- "UUUAAAGGGCCC",
366
- "((((....))))..(((.((((.......))))...))).."
367
- ]
368
- ],
369
- inputs=[utr5_seq, cds_seq, utr3_seq, structure]
370
- )
371
 
372
- # 提交处理
373
- submit_btn.click(
374
- visualize_rna,
375
- inputs=[utr5_seq, cds_seq, utr3_seq, structure],
376
- outputs=[output_image, mfe, structure, message]
377
- )
378
- return demo
379
 
380
 
381
- # 运行应用
382
  if __name__ == "__main__":
383
- demo = draw_rna_2d()
384
- demo.launch(server_port=8080, debug=True)
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import random
3
 
4
  import pandas as pd
5
  import numpy as np
6
+ from typing import List, Dict, Tuple, Union
7
+ from collections import defaultdict
8
+
9
+
10
+ class Codon:
11
+ CODON_TO_AA = {
12
+ 'UUU': 'F', 'UUC': 'F', # Phe (2-fold)
13
+ 'UUA': 'L', 'UUG': 'L', 'CUU': 'L', 'CUC': 'L', 'CUA': 'L', 'CUG': 'L', # Leu (6-fold)
14
+ 'AUU': 'I', 'AUC': 'I', 'AUA': 'I', # Ile (3-fold)
15
+ 'AUG': 'M', # Met (无同义密码子,排除)
16
+ 'GUU': 'V', 'GUC': 'V', 'GUA': 'V', 'GUG': 'V', # Val (4-fold)
17
+ 'UCU': 'S', 'UCC': 'S', 'UCA': 'S', 'UCG': 'S', 'AGU': 'S', 'AGC': 'S', # Ser (6-fold)
18
+ 'CCU': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', # Pro (4-fold)
19
+ 'ACU': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', # Thr (4-fold)
20
+ 'GCU': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', # Ala (4-fold)
21
+ 'UAU': 'Y', 'UAC': 'Y', # Tyr (2-fold)
22
+ 'UAA': '*', 'UAG': '*', 'UGA': '*', # 终止密码子 (排除)
23
+ 'CAU': 'H', 'CAC': 'H', # His (2-fold)
24
+ 'CAA': 'Q', 'CAG': 'Q', # Gln (2-fold)
25
+ 'AAU': 'N', 'AAC': 'N', # Asn (2-fold)
26
+ 'AAA': 'K', 'AAG': 'K', # Lys (2-fold)
27
+ 'GAU': 'D', 'GAC': 'D', # Asp (2-fold)
28
+ 'GAA': 'E', 'GAG': 'E', # Glu (2-fold)
29
+ 'UGU': 'C', 'UGC': 'C', # Cys (2-fold)
30
+ 'UGG': 'W', # Trp (无同义密码子,排除)
31
+ 'CGU': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'AGA': 'R', 'AGG': 'R', # Arg (6-fold)
32
+ 'GGU': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G' # Gly (4-fold)
33
+ }
34
+ def __init__(self, codon_usage_path, rna=True):
35
+ self.bases = 'GAUC'
36
+ self.aas = 'ACDEFGHIKLMNPQRSTVWY*'.lower()
37
+ self.codon_table = {}
38
+ self.max_aa_table = {}
39
+ self.cai_best_aa2nn_table = {}
40
+ self.frame_ith_aa_base_fraction = {
41
+ i: {
42
+ a: {
43
+ base: 0.0 for base in self.bases
44
+ } for a in self.aas
45
+ } for i in range(3)
46
+ }
47
+ # 1: {'A': {'A': 0.0, 'C': 0.0, 'G': 0.0, 'U': 0.0},
48
+ # 'C': {'A': 0.0, 'C': 0.0, 'G': 0.0, 'U': 0.0},
49
+ # 'G': {'A': 0.0, 'C': 0.0, 'G': 0.0, 'U': 0.0},
50
+ # 'U': {'A': 0.0, 'C': 0.0, 'G': 0.0, 'U': 0.0}},
51
+
52
+ # self.frame_ith_aa_base_fraction = {0: defaultdict(list), 1: defaultdict(list), 2: defaultdict(list)}
53
+ # self.frame_ith_aa_base_fraction = {i:{a:{base:defaultdict(float)} for a in self.aas for base in self.bases} for i in range(3)}
54
+ # rna参数现在只用于控制输出格式,输入可以是RNA或DNA
55
+ self.output_rna = rna
56
+
57
+ # RNA标准密码子表(用于ENC和RSCU计算)
58
+ self.standard_codon_table = self.CODON_TO_AA
59
+
60
+ # 按简并度预分组氨基酸
61
+ self.degeneracy_groups = {
62
+ '2-fold': ['F', 'Y', 'C', 'H', 'Q', 'N', 'K', 'D', 'E'],
63
+ '3-fold': ['I'],
64
+ '4-fold': ['V', 'P', 'T', 'A', 'G'],
65
+ '6-fold': ['L', 'S', 'R']
66
+ }
67
+
68
+ # print(f"\nOutput format: {'RNA' if self.output_rna else 'DNA'}")
69
+ # print(f"Loading codon usage table from {codon_usage_path}")
70
+ # print("suppose csv in the format columns: 'codon', 'amino_acid', 'fraction'\n")
71
+ if os.access(codon_usage_path, os.R_OK) and os.path.getsize(codon_usage_path) > 0:
72
+ with open(codon_usage_path, 'r') as codon_file:
73
+ next(codon_file) # Skip the header line
74
+ for line in codon_file:
75
+ line = line.strip()
76
+ if not line:
77
+ continue
78
+ codon, aa, fraction, *_ = line.split(',')
79
+ # 内部统一存储为RNA格式, AA 小写
80
+ codon = codon.upper().replace('T', 'U')
81
+ aa = aa.lower()
82
+ fraction = float(fraction)
83
+
84
+ self.codon_table[codon] = (aa, fraction)
85
+ for i,base in enumerate(codon):
86
+ # print(i,aa,base,fraction,self.frame_ith_aa_base_fraction[i][aa])
87
+ # self.frame_ith_table[i][aa].append((base, fraction))
88
+ self.frame_ith_aa_base_fraction[i][aa][base] = fraction + self.frame_ith_aa_base_fraction[i][aa][base]
89
+ # self.frame_ith_table[i][aa][base] = fraction + self.frame_ith_table[i][aa][base]
90
+ if aa not in self.max_aa_table or self.max_aa_table[aa] < fraction:
91
+ self.max_aa_table[aa] = fraction
92
+ self.cai_best_aa2nn_table[aa] = codon
93
+ # frame_ith_table = [self.frame_ith_table[i][aa] for aa in self.frame_ith_table[i] for i in range(3)]
94
+ print(f"Codon usage table loaded, {len(self.codon_table)} codons loaded from {codon_usage_path}")
95
+ else:
96
+ print(f'codon usage table is missing',codon_usage_path)
97
+
98
+ self.aa_to_codons = self._build_aa_to_codons()
99
+ # 预计算氨基酸到密码子权重的映射(用于加权随机)
100
+ self.aa_to_weights = self._build_aa_to_weights()
101
+ self.calculate_CAI = self.calc_cai
102
+
103
+ def _build_aa_to_codons(self):
104
+ """构建氨基酸到密码子列表的映射"""
105
+ aa_to_codons = defaultdict(list)
106
+ for codon, (aa, _) in self.codon_table.items():
107
+ aa_to_codons[aa].append(codon)
108
+ return dict(aa_to_codons)
109
+
110
+ def _build_aa_to_weights(self):
111
+ """构建氨基酸到密码子权重的映射"""
112
+ aa_to_weights = defaultdict(list)
113
+ for codon, (aa, weight) in self.codon_table.items():
114
+ aa_to_weights[aa].append(weight)
115
+ return dict(aa_to_weights)
116
+
117
+ def _normalize_sequence(self, sequence: str) -> str:
118
+ """标准化序列为RNA格式"""
119
+ sequence = sequence.upper()
120
+ # 将DNA转换为RNA格式(内部统一使用RNA)
121
+ sequence = sequence.replace('T', 'U')
122
+ return sequence
123
+
124
+ def _validate_sequence(self, sequence: str) -> str:
125
+ """验证并标准化序列"""
126
+ sequence = self._normalize_sequence(sequence)
127
+
128
+ if len(sequence) % 3 != 0:
129
+ raise ValueError(f"序列长度必须是3的倍数,当前长度: {len(sequence)}")
130
+
131
+ valid_bases = {'A', 'U', 'C', 'G'}
132
+ if not all(base in valid_bases for base in sequence):
133
+ raise ValueError("序列包含无效的碱基字符")
134
+
135
+ return sequence
136
+
137
+ def _count_codons(self, sequence: str) -> Dict[str, int]:
138
+ """统计序列中密码子使用次数"""
139
+ sequence = self._validate_sequence(sequence)
140
+ codon_count = {}
141
+ num_codons = len(sequence) // 3
142
+
143
+ for i in range(num_codons):
144
+ codon = sequence[i * 3:(i + 1) * 3]
145
+ if codon in self.standard_codon_table and self.standard_codon_table[codon] != '*':
146
+ codon_count[codon] = codon_count.get(codon, 0) + 1
147
+
148
+ return codon_count
149
+
150
+ @staticmethod
151
+ def translate_sequence(sequence: str) -> str:
152
+ """将序列翻译为氨基酸序列"""
153
+ sequence = sequence.upper().replace('T', 'U')
154
+ aa_seq = ''
155
+ for i in range(0, len(sequence), 3):
156
+ codon = sequence[i:i + 3]
157
+ if codon in Codon.CODON_TO_AA:
158
+ aa = Codon.CODON_TO_AA[codon]
159
+ aa_seq += aa
160
+ return aa_seq
161
+ def calc_cai(self, seq):
162
+ """计算CAI值,输入可以是RNA或DNA序列"""
163
+ # 标准化序列为RNA格式
164
+ seq = self._normalize_sequence(seq)
165
+ if len(seq) % 3 != 0:
166
+ # raise ValueError(f"序列长度必须是3的倍数, 当前长度: {len(seq)},{seq}")
167
+ return np.nan
168
+ cai = 0.0
169
+ valid_num = 0
170
+ for i in range(0, len(seq), 3):
171
+ codon = seq[i:i + 3]
172
+ if codon not in self.codon_table:
173
+ continue
174
+ aa, fraction = self.codon_table[codon]
175
+ f_c_max = self.max_aa_table[aa]
176
+
177
+ w_i = fraction / f_c_max
178
+ cai += np.log2(w_i)
179
+ valid_num += 1
180
+
181
+ return np.exp2(cai / valid_num) if valid_num > 0 else 0.0
182
+
183
+
184
+ def cai_opt_codon(self, aa_seq):
185
+ aa_seq = aa_seq.lower()
186
+ """获取CAI最优密码子序列"""
187
+ cai_opt_codon = []
188
+ for i in range(0, len(aa_seq), 1):
189
+ aa = aa_seq[i]
190
+ codon = self.cai_best_aa2nn_table.get(aa, '___')
191
+ # 根据输出格式转换
192
+ if not self.output_rna:
193
+ codon = codon.replace('U', 'T')
194
+ cai_opt_codon.append(codon)
195
+ return ''.join(cai_opt_codon)
196
+
197
+ def random_codon(self, aa_seq):
198
+ """
199
+ 根据密码子频率加权随机生成CDS序列
200
+
201
+ 参数:
202
+ aa_sequence (str): 氨基酸序列(单字母)
203
+
204
+ 返回:
205
+ str: 随机生成的DNA序列
206
+ """
207
+
208
+ aa_seq = aa_seq.lower()
209
+ opt_codon = []
210
+ for i in range(0, len(aa_seq), 1):
211
+ aa = aa_seq[i]
212
+
213
+ if aa not in self.aa_to_codons:
214
+ codon = '___'
215
+ else:
216
+ codons = self.aa_to_codons[aa] # ['AUG']
217
+ weights = self.aa_to_weights[aa] # [1.0]
218
+
219
+ codon = random.choices(codons, weights=weights, k=1)[0]
220
+ opt_codon.append(codon)
221
+
222
+ opt_nn = ''.join(opt_codon)
223
+ # 根据输出格式转换
224
+ if not self.output_rna:
225
+ opt_nn = opt_nn.replace('U', 'T')
226
+ return opt_nn
227
+
228
+ def random_codon_weight(self, aa_seq,weights_df=None):
229
+ """
230
+ 根据密码子频率加权随机生成CDS序列
231
+
232
+ 参数:
233
+ aa_sequence (str): 氨基酸序列(单字母)
234
+
235
+ 返回:
236
+ str: 随机生成的DNA序列
237
+ """
238
+ if weights_df is None:
239
+ return self.random_codon(aa_seq)
240
+
241
+ # weights_df.columns = ['triplet', 'amino_acid', 'fraction']
242
+ # weights_df_gp = weights_df.groupby(by='amino_acid')
243
+ aa_seq = aa_seq.lower()
244
+ opt_codon = []
245
+ for i in range(0, len(aa_seq), 1):
246
+ aa = aa_seq[i]
247
+
248
+ if aa not in self.aa_to_codons:
249
+ codon = '___'
250
  else:
251
+ tmp = weights_df[weights_df['amino_acid']==aa]
252
+ codon = random.choices(tmp['triplet'].to_list(), weights=tmp['fraction'].to_list(), k=1)[0]
253
+ opt_codon.append(codon)
254
+
255
+ opt_nn = ''.join(opt_codon)
256
+ # 根据输出格式转换
257
+ if not self.output_rna:
258
+ opt_nn = opt_nn.replace('U', 'T')
259
+ return opt_nn
260
+
261
+ def calculate_ENC(self, sequence: str) -> float:
262
+ """
263
+ 计算单条序列的ENC值,输入可以是RNA或DNA序列
264
+
265
+ 参数:
266
+ sequence: 序列字符串
267
+
268
+ 返回:
269
+ enc_value: ENC值
270
+ """
271
+ codon_count = self._count_codons(sequence)
272
+
273
+ # 按氨基酸分组
274
+ amino_acid_counts = {}
275
+ for codon, aa in self.standard_codon_table.items():
276
+ if aa in ['M', 'W'] or aa == '*':
277
+ continue
278
+ if aa not in amino_acid_counts:
279
+ amino_acid_counts[aa] = {}
280
+ amino_acid_counts[aa][codon] = codon_count.get(codon, 0)
281
+
282
+ # 计算每个氨基酸组的F值
283
+ F_values = {'2-fold': [], '3-fold': [], '4-fold': [], '6-fold': []}
284
+
285
+ for aa, codon_counts in amino_acid_counts.items():
286
+ # 确定简并度
287
+ degeneracy = None
288
+ for deg, aas in self.degeneracy_groups.items():
289
+ if aa in aas:
290
+ degeneracy = deg
291
+ break
292
+
293
+ if not degeneracy:
294
+ continue
295
+
296
+ # 获取该氨基酸的所有同义密码子
297
+ codons_for_aa = [c for c, a in self.standard_codon_table.items()
298
+ if a == aa and a not in ['M', 'W'] and a != '*']
299
+ s = len(codons_for_aa)
300
+
301
+ # 统计使用次数
302
+ n_i_values = [codon_counts.get(codon, 0) for codon in codons_for_aa]
303
+ total_n = sum(n_i_values)
304
+
305
+ if total_n == 0 or s <= 1:
306
+ continue
307
+
308
+ # 计算F值
309
+ sum_squared_freq = sum((n_i / total_n) ** 2 for n_i in n_i_values)
310
+ F = (s * sum_squared_freq - 1) / (s - 1)
311
+
312
+ F_values[degeneracy].append(F)
313
+
314
+ # 计算各简并度的平均F值
315
+ # F2_avg = np.mean(F_values['2-fold']) if F_values['2-fold'] else 1.0
316
+ # F3_avg = np.mean(F_values['3-fold']) if F_values['3-fold'] else 1.0
317
+ # F4_avg = np.mean(F_values['4-fold']) if F_values['4-fold'] else 1.0
318
+ # F6_avg = np.mean(F_values['6-fold']) if F_values['6-fold'] else 1.0
319
+ enc_value = 2.0
320
+
321
+ if F_values['2-fold']:
322
+ enc_value += 9.0 / np.mean(F_values['2-fold'])
323
+ if F_values['3-fold']:
324
+ enc_value += 1.0 / np.mean(F_values['3-fold'])
325
+ if F_values['4-fold']:
326
+ enc_value += 5.0 / np.mean(F_values['4-fold'])
327
+ if F_values['6-fold']:
328
+ enc_value += 3.0 / np.mean(F_values['6-fold'])
329
+ # 计算ENC值
330
+ # enc_value = 2 + 9 / F2_avg + 1 / F3_avg + 5 / F4_avg + 3 / F6_avg
331
+
332
+ return enc_value
333
+
334
+ def calculate_RSCU(self, sequences: List[str]) -> Dict[str, float]:
335
+ """
336
+ 计算相对同义密码子使用度 (Relative Synonymous Codon Usage, RSCU)
337
+
338
+ 参数:
339
+ sequences: 序列列表(可以是RNA或DNA)
340
+
341
+ 返回:
342
+ rscu_dict: 每个密码子的RSCU值字典(RNA格式)
343
+ """
344
+ total_codon_count = defaultdict(int)
345
+ aa_observed_codons = defaultdict(set)
346
+
347
+ # 统计所有序列的密码子使用
348
+ for seq in sequences:
349
+ try:
350
+ codon_count = self._count_codons(seq)
351
+ for codon, count in codon_count.items():
352
+ aa = self.standard_codon_table[codon]
353
+ total_codon_count[codon] += count
354
+ aa_observed_codons[aa].add(codon)
355
+ except ValueError:
356
+ continue # 跳过无效序列
357
+
358
+ # 计算RSCU
359
+ rscu_dict = {}
360
+ aa_total_count = defaultdict(int)
361
+
362
+ # 首先计算每个氨基酸的总密码子数
363
+ for codon, count in total_codon_count.items():
364
+ aa = self.standard_codon_table[codon]
365
+ aa_total_count[aa] += count
366
+
367
+ # 然后计算每个密码子的RSCU
368
+ for codon, count in total_codon_count.items():
369
+ aa = self.standard_codon_table[codon]
370
+ if aa_total_count[aa] > 0:
371
+ # 该氨基酸的同义密码子数量
372
+ synonymous_codons = len([c for c in aa_observed_codons[aa]
373
+ if self.standard_codon_table[c] == aa])
374
+ expected_count = aa_total_count[aa] / synonymous_codons
375
+ rscu_dict[codon] = count / expected_count if expected_count > 0 else 0.0
376
+ else:
377
+ rscu_dict[codon] = 0.0
378
+
379
+ return rscu_dict
380
+
381
+ def analyze_sequence(self, sequence: str, sequence_name: str = "") -> Dict:
382
+ """
383
+ 综合分析单条序列的密码子使用特征
384
+
385
+ 参数:
386
+ sequence: 序列字符串(可以是RNA或DNA)
387
+ sequence_name: 序列名称(可选)
388
+
389
+ 返回:
390
+ 包含所有指标的字典
391
+ """
392
+ try:
393
+ enc = self.calculate_ENC(sequence)
394
+ cai = self.calc_cai(sequence)
395
+ result = {
396
+ 'Sequence_Name': sequence_name,
397
+ 'Sequence_Length': len(sequence),
398
+ 'ENC': round(enc, 3),
399
+ 'ENC_Preference': 'strong' if enc <= 35 else 'week',
400
+ 'CAI': round(cai, 3),
401
+ 'CAI_Level': 'high' if cai > 0.7 else 'low'
402
+ }
403
+
404
+ return result
405
+
406
+ except Exception as e:
407
+ return {
408
+ 'Sequence_Name': sequence_name,
409
+ 'Sequence_Length': len(sequence),
410
+ 'ENC': None,
411
+ 'CAI': None,
412
+ 'Error': str(e)
413
+ }
414
+
415
+ @staticmethod
416
+ def modify_func(sequence):
417
+ return '_'*len(sequence)
418
+ @staticmethod
419
+ def modify_codon_by_frames(sequence, frames=[1,2,3], modify_func=None):
420
+ """
421
+ 高级版本:支持自定义修改函数
422
+
423
+ 参数:
424
+ sequence (str): 输入序列
425
+ frame (int): 要修改的密码子位置 (1, 2, 3)
426
+ modify_func (callable): 修改函数,接收原帧字符串,返回修改后的字符串
427
+
428
+ 返回:
429
+ str: 修改后的重建序列
430
+ """
431
+ # 清理序列
432
+ seq = sequence.upper().replace(' ', '').replace('\n', '')
433
+ seq = seq[:len(seq) - len(seq) % 3]
434
+
435
+ # 使用切片提取帧
436
+ frames = [seq[0::3], seq[1::3], seq[2::3]]
437
+
438
+ reconstructed_list =[]
439
+ # 应用修改函数
440
+ for frame in frames:
441
+ frame_index = frame - 1
442
+ if modify_func:
443
+ frames[frame_index] = modify_func(frames[frame_index])
444
+
445
+ # 重建序列
446
+ reconstructed = ''.join(
447
+ frames[0][i] + frames[1][i] + frames[2][i]
448
+ for i in range(len(frames[0]))
449
+ )
450
+ reconstructed_list.append(reconstructed)
451
+
452
+ return reconstructed_list
453
+
454
+ # 使用示例 - 测试所有功能
455
+ def example_usage():
456
+ """测试所有功能"""
457
+ print("=" * 60)
458
+ print("测试 Codon 类的所有功能")
459
+ print("=" * 60)
460
+
461
+ # 测试数据
462
+ species_list = ["mouse", "Ec", "Sac", "Pic", "Human"]
463
+ test_species = "mouse" # 选择一个物种进行详细测试
464
+
465
+ # 测试序列
466
+ aa_seq = "MASV"
467
+ dna_seq = "ATGGCCATGGCGCCCAGAACTGAGATCAAATAGTACCCGTATTAACGGGTA"
468
+ rna_seq = dna_seq.replace('T', 'U')
469
+ # 测试序列集合(用于RSCU计算)
470
+ test_sequences = [
471
+ "AUGGCUUCUUUUUUCUUCUUCUUCUUCUUCUUCCUCCUCCUCCUCCUCCUCCUCCUC", # RNA
472
+ "ATGGCUUCUUUUCUCGUAUACACAGATGACTACGTTAGCAGCTACGTTACGTTACGTTACG", # DNA
473
+ "AUGGUUUGUUGGUUGGUUGGUUGGUUGGUUGGUUGGUUGGUUGGUUGGUUGGUUGGA" # RNA
474
  ]
475
 
476
+ # 单个测试序列
477
+ test_sequence = "AUGGCUUCUUUUCUCGUAUACACAGAUGACUACGUAGCAGCUACGUACGUACGUACG"
478
+
479
+
480
+ Codon.translate_sequence(dna_seq) # 验证translate_sequence函数
481
+
482
+
483
+ # 假设的密码子使用表路径
484
+ codon_table_path = "/Users/gz_julse/code/minimind_RiboUTR/maotao_file/codon_table/codon_usage_{species}.csv"
485
+
486
+ print(f"\n1. 初始化 Codon 实例 (物种: {species_list})")
487
+ print("-" * 50)
488
+
489
+ # 创建分析器实例,输出格式为RNA和DNA各一个
490
+ codon_instance_dna = {species: Codon(codon_table_path.format(species=species), rna=False) for species in
491
+ species_list}
492
+ codon_instance_rna = {species: Codon(codon_table_path.format(species=species), rna=True) for species in
493
+ species_list}
494
+
495
+
496
+ print(f"✓ 成功创建 {len(species_list)} 个物种的 Codon 实例")
497
+
498
+ print(f"\n2. 测试 CAI 计算")
499
+ print("-" * 50)
500
+
501
+ # 测试DNA和RNA序列输入
502
+ print("DNA序列CAI:", [codon_instance_rna[species].calc_cai(dna_seq) for species in species_list])
503
+ print("RNA序列CAI:", [codon_instance_rna[species].calc_cai(rna_seq) for species in species_list])
504
+
505
+ # 验证DNA和RNA输入结果一致
506
+ dna_cai = codon_instance_rna[test_species].calc_cai(dna_seq)
507
+ rna_cai = codon_instance_rna[test_species].calc_cai(rna_seq)
508
+ print(f"✓ DNA和RNA输入结果一致: {np.isclose(dna_cai, rna_cai)}")
509
+
510
+ print(f"\n3. 测试 CAI 最优密码子序列")
511
+ print("-" * 50)
512
+
513
+ # 测试最优密码子序列
514
+ opt_rna = codon_instance_rna[test_species].cai_opt_codon(aa_seq)
515
+ opt_dna = codon_instance_dna[test_species].cai_opt_codon(aa_seq)
516
+
517
+ print(f"氨基酸序列: {aa_seq}")
518
+ print(f"RNA格式最优密码子: {opt_rna}")
519
+ print(f"DNA格式最优密码子: {opt_dna}")
520
+ print(f"✓ 输出格式正确: RNA={opt_rna.replace('T', '') == opt_rna}, DNA={opt_dna.replace('U', '') == opt_dna}")
521
+
522
+ print(f"\n4. 测试 ENC 计算")
523
+ print("-" * 50)
524
+
525
+ # 测试ENC计算
526
+ enc_dna = codon_instance_rna[test_species].calculate_ENC(dna_seq)
527
+ enc_rna = codon_instance_rna[test_species].calculate_ENC(rna_seq)
528
+
529
+ print(f"DNA序列ENC: {enc_dna:.3f}")
530
+ print(f"RNA序列ENC: {enc_rna:.3f}")
531
+ print(f"✓ DNA和RNA输入结果一致: {np.isclose(enc_dna, enc_rna)}")
532
+
533
+ print(f"\n5. 测试 RSCU 计算")
534
+ print("-" * 50)
535
+
536
+ # 测试RSCU计算
537
+ rscu_results = codon_instance_rna[test_species].calculate_RSCU(test_sequences)
538
+ print(f"计算了 {len(rscu_results)} 个密码子的RSCU值")
539
+ print("前10个密码子的RSCU值:")
540
+ for i, (codon, rscu) in enumerate(list(rscu_results.items())[:10]):
541
+ print(f" {codon}: {rscu:.3f}")
542
+
543
+ print(f"\n6. 测试综合分析 (analyze_sequence)")
544
+ print("-" * 50)
545
+
546
+ # 测试综合分析
547
+ analysis_result = codon_instance_rna[test_species].analyze_sequence(test_sequence, "Test_Gene")
548
+ print("综合分析结果:")
549
+ for key, value in analysis_result.items():
550
+ print(f" {key}: {value}")
551
+
552
+ print(f"\n7. 测试序列验证功能")
553
+ print("-" * 50)
554
+
555
+ # 测试无效序列
556
+ invalid_seqs = [
557
+ "AUGGCUUCUUUUCUCG", # 长度不是3的倍数
558
+ "AUGXXXUUUUCUCGUAUACACAGAUGACUACGUAGCAGCUACGUACGUACGUACG", # 包含无效字符
559
  ]
560
 
561
+ for i, seq in enumerate(invalid_seqs):
562
+ try:
563
+ codon_instance_rna[test_species]._validate_sequence(seq)
564
+ print(f"序列 {i + 1}: 错误地通过了验证")
565
+ except ValueError as e:
566
+ print(f"序列 {i + 1}: 正确捕获错误 - {e}")
567
+
568
+ print(f"\n8. 测试密码子计数")
569
+ print("-" * 50)
570
+
571
+ # 测试密码子计数
572
+ codon_count = codon_instance_rna[test_species]._count_codons(test_sequence)
573
+ print(f"序列 '{test_sequence[:20]}...' 的密码子计数:")
574
+ for codon, count in list(codon_count.items())[:5]:
575
+ print(f" {codon}: {count}")
576
+ print(f" ... (共 {len(codon_count)} 种密码子)")
577
+
578
+ print(f"\n9. 测试不同输出格式的兼容性")
579
+ print("-" * 50)
580
+
581
+ # 验证RNA和DNA输出实例的CAI计算相同
582
+ cai_rna_instance = codon_instance_rna[test_species].calc_cai(test_sequence)
583
+ cai_dna_instance = codon_instance_dna[test_species].calc_cai(test_sequence)
584
+
585
+ print(f"RNA输出实例CAI: {cai_rna_instance:.4f}")
586
+ print(f"DNA输出实例CAI: {cai_dna_instance:.4f}")
587
+ print(f"✓ 不同输出格式实例的CAI计算相同: {np.isclose(cai_rna_instance, cai_dna_instance)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
588
 
589
+ print(f"\n" + "=" * 60)
590
+ print("所有功能测试完成!")
591
+ print("=" * 60)
 
 
 
 
592
 
593
 
 
594
  if __name__ == "__main__":
595
+ example_usage()