Update model/codon_attr.py
Browse files- model/codon_attr.py +582 -371
model/codon_attr.py
CHANGED
|
@@ -1,384 +1,595 @@
|
|
| 1 |
-
#!/usr/bin/env python
|
| 2 |
-
# -*- coding: utf-8 -*-
|
| 3 |
-
"""
|
| 4 |
-
Title : drawRNA.py
|
| 5 |
-
project : web
|
| 6 |
-
Created by: julse
|
| 7 |
-
Created on: 2025/7/4 14:24
|
| 8 |
-
des: TODO
|
| 9 |
-
"""
|
| 10 |
-
|
| 11 |
-
import sys
|
| 12 |
import os
|
| 13 |
-
import
|
| 14 |
|
| 15 |
import pandas as pd
|
| 16 |
import numpy as np
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
#
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
"
|
| 80 |
-
"
|
| 81 |
-
",
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
else:
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
]
|
| 178 |
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
for
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
]
|
| 213 |
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
#
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
def generate_rna_structure(utr5_seq, cds_seq, utr3_seq, structure, draw_2d=["mRNA"]):
|
| 246 |
-
"""生成RNA结构图"""
|
| 247 |
-
message = ""
|
| 248 |
-
# 组合完整序列
|
| 249 |
-
full_sequence = utr5_seq + cds_seq + utr3_seq
|
| 250 |
-
mfe = None
|
| 251 |
-
img_paths = []
|
| 252 |
-
|
| 253 |
-
if "Full mRNA" in draw_2d:
|
| 254 |
-
if structure == "":
|
| 255 |
-
structure, mfe = calc_mfe(full_sequence)
|
| 256 |
-
# 验证序列和结构长度匹配
|
| 257 |
-
if len(full_sequence) != len(structure):
|
| 258 |
-
return f"序列长度({len(full_sequence)})与结构长度({len(structure)})不匹配"
|
| 259 |
-
'''full mRNA'''
|
| 260 |
-
# 获取各区域位置
|
| 261 |
-
utr5_range, start_codon_range, stop_codon_range, utr3_range, start_codon, stop_codon = get_bases_index(
|
| 262 |
-
utr5_seq, cds_seq, utr3_seq
|
| 263 |
-
)
|
| 264 |
-
focus_region = f'{min(eval(utr5_range))}-{max(eval(start_codon_range))}:fill=#bcffdd;{min(eval(stop_codon_range))}-{max(eval(utr3_range))}:fill=#bcffdd'
|
| 265 |
-
pairs = dbn_to_tuple(structure, c1_region=eval(','.join([utr5_range, start_codon, start_codon_range])),
|
| 266 |
-
c2_region=eval(','.join([stop_codon_range, utr3_range, stop_codon])))
|
| 267 |
-
auxBPs = ';'.join([f'({key},{value}):color=#6ed86e' for key, value in pairs.items()])
|
| 268 |
-
for algorithm in ["line", "naview"]:
|
| 269 |
-
# 创建临时文件
|
| 270 |
-
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmpfile:
|
| 271 |
-
output_file = tmpfile.name
|
| 272 |
-
img_paths.append((output_file,f'mRNA_{algorithm}'))
|
| 273 |
-
# algorithm = "line" # 线条算法
|
| 274 |
-
# 构建VARNA命令, write to file
|
| 275 |
-
# run_VARNA(full_sequence, structure, utr5_range, start_codon_range, stop_codon_range, utr3_range,
|
| 276 |
-
# start_codon, stop_codon, focus_region, auxBPs, output_file, algorithm, title='mRNA')
|
| 277 |
-
#
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
algorithm = algorithm=="line" # 线条算法
|
| 281 |
-
# 构建VARNA命令, write to file
|
| 282 |
-
run_draw_rna_advanced(full_sequence, structure, utr5_range, start_codon_range, stop_codon_range, utr3_range,
|
| 283 |
-
start_codon, stop_codon, focus_region, auxBPs, output_file,algorithm, title='mRNA')
|
| 284 |
-
|
| 285 |
-
if "5'leader (30 nt)" in draw_2d:
|
| 286 |
-
img_path, local_mfe, stru5 = draw_simple(full_sequence[:30], title="5'leader (30 nt))")
|
| 287 |
-
img_paths.extend((img_path,'head_30'))
|
| 288 |
-
message += f"\nhead(30nt) MFE={local_mfe:.2f} kcal/mol"
|
| 289 |
-
if "5'UTR" in draw_2d:
|
| 290 |
-
img_path, local_mfe, stru5 = draw_simple(utr5_seq, title="5'UTR")
|
| 291 |
-
img_paths.extend((img_path,'utr5'))
|
| 292 |
-
message += f"\n5'UTR MFE={local_mfe:.2f} kcal/mol"
|
| 293 |
-
if "CDS" in draw_2d:
|
| 294 |
-
img_path, local_mfe, stru5 = draw_simple(cds_seq, title="CDS")
|
| 295 |
-
img_paths.extend((img_path,'cds'))
|
| 296 |
-
message += f"\nCDS MFE={local_mfe:.2f} kcal/mol"
|
| 297 |
-
if "3'UTR" in draw_2d:
|
| 298 |
-
img_path, local_mfe, stru5 = draw_simple(utr3_seq, title="3'UTR")
|
| 299 |
-
img_paths.extend((img_path,'utr3'))
|
| 300 |
-
message += f"\n3'UTR MFE={local_mfe:.2f} kcal/mol"
|
| 301 |
-
|
| 302 |
-
return img_paths, mfe, structure, message
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
def visualize_rna(utr5_seq, cds_seq, utr3_seq, structure):
|
| 306 |
-
"""可视化RNA结构的主函数"""
|
| 307 |
-
# 生成RNA结构图
|
| 308 |
-
image_path, mfe, structure, message = generate_rna_structure(utr5_seq, cds_seq, utr3_seq, structure)
|
| 309 |
-
mfe = f'MFE={mfe:.2f} kcal/mol' if mfe else None
|
| 310 |
-
|
| 311 |
-
# 返回图像
|
| 312 |
-
return image_path, mfe, structure, message
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
def draw_rna_2d():
|
| 316 |
-
# 创建Gradio界面
|
| 317 |
-
with gr.Blocks(title="RNA结构可视化") as demo:
|
| 318 |
-
gr.Markdown("# RNA结构可视化工具")
|
| 319 |
-
gr.Markdown("使用VARNA可视化RNA二级结构,并高亮显示不同区域")
|
| 320 |
-
|
| 321 |
-
with gr.Row():
|
| 322 |
-
with gr.Column(scale=1):
|
| 323 |
-
utr5_seq = gr.Textbox(label="5'UTR序列", value="AUGCCAUGAACAGCUAC", placeholder="输入5'UTR序列...")
|
| 324 |
-
cds_seq = gr.Textbox(label="CDS序列", value="AUGCCAUGAACAGCUAC", placeholder="输入CDS序列...")
|
| 325 |
-
utr3_seq = gr.Textbox(label="3'UTR序列", value="AUGCCAUGAACAGCUAC", placeholder="输入3'UTR序列...")
|
| 326 |
-
structure = gr.Textbox(
|
| 327 |
-
label="二级结构",
|
| 328 |
-
value="...........((((.((((.((((........)))).))))...))))..",
|
| 329 |
-
placeholder="输入点括号表示的二级结构..."
|
| 330 |
-
)
|
| 331 |
-
submit_btn = gr.Button("生成结构图", variant="primary")
|
| 332 |
-
|
| 333 |
-
with gr.Column():
|
| 334 |
-
# output_image = gr.Image(label="RNA结构图", interactive=False)
|
| 335 |
-
output_image = gr.Gallery(label="RNA结构图", interactive=False, object_fit="contain")
|
| 336 |
-
mfe = gr.Markdown(label="MFE", value="")
|
| 337 |
-
message = gr.Markdown(label="Message", value="")
|
| 338 |
-
|
| 339 |
-
# 颜色图例
|
| 340 |
-
with gr.Accordion("颜色说明", open=False):
|
| 341 |
-
gr.Markdown("""
|
| 342 |
-
| 颜色 | 区域 |
|
| 343 |
-
|------|------|
|
| 344 |
-
| <span style="color:red">■</span> 红色 | 5'UTR 区域 |
|
| 345 |
-
| <span style="color:blue">■</span> 蓝色 | CDS起始区域 |
|
| 346 |
-
| <span style="color:#FFC0CB">■</span> 粉色 | CDS终止区域 |
|
| 347 |
-
| <span style="color:orange">■</span> 橙色 | 3'UTR 区域 |
|
| 348 |
-
| <span style="color:yellow">■</span> 黄色 | 起始密码子 (AUG) |
|
| 349 |
-
| <span style="color:purple">■</span> 紫色 | 终止密码子 (UAA, UAG, UGA) |
|
| 350 |
-
""")
|
| 351 |
-
|
| 352 |
-
# 示例数据
|
| 353 |
-
with gr.Accordion("示例数据", open=False):
|
| 354 |
-
gr.Examples(
|
| 355 |
-
examples=[
|
| 356 |
-
[
|
| 357 |
-
"AUGCCAUGAACAGCUAC",
|
| 358 |
-
"AUGCCAUGAACAGCUAC",
|
| 359 |
-
"AUGCCAUGAACAGCUAC",
|
| 360 |
-
"...........((((.((((.((((........)))).))))...)))).."
|
| 361 |
-
],
|
| 362 |
-
[
|
| 363 |
-
"GGGAAAUUUCCC",
|
| 364 |
-
"AUGCCAUGAACAGCUAC",
|
| 365 |
-
"UUUAAAGGGCCC",
|
| 366 |
-
"((((....))))..(((.((((.......))))...))).."
|
| 367 |
-
]
|
| 368 |
-
],
|
| 369 |
-
inputs=[utr5_seq, cds_seq, utr3_seq, structure]
|
| 370 |
-
)
|
| 371 |
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
inputs=[utr5_seq, cds_seq, utr3_seq, structure],
|
| 376 |
-
outputs=[output_image, mfe, structure, message]
|
| 377 |
-
)
|
| 378 |
-
return demo
|
| 379 |
|
| 380 |
|
| 381 |
-
# 运行应用
|
| 382 |
if __name__ == "__main__":
|
| 383 |
-
|
| 384 |
-
demo.launch(server_port=8080, debug=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
+
import random
|
| 3 |
|
| 4 |
import pandas as pd
|
| 5 |
import numpy as np
|
| 6 |
+
from typing import List, Dict, Tuple, Union
|
| 7 |
+
from collections import defaultdict
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class Codon:
|
| 11 |
+
CODON_TO_AA = {
|
| 12 |
+
'UUU': 'F', 'UUC': 'F', # Phe (2-fold)
|
| 13 |
+
'UUA': 'L', 'UUG': 'L', 'CUU': 'L', 'CUC': 'L', 'CUA': 'L', 'CUG': 'L', # Leu (6-fold)
|
| 14 |
+
'AUU': 'I', 'AUC': 'I', 'AUA': 'I', # Ile (3-fold)
|
| 15 |
+
'AUG': 'M', # Met (无同义密码子,排除)
|
| 16 |
+
'GUU': 'V', 'GUC': 'V', 'GUA': 'V', 'GUG': 'V', # Val (4-fold)
|
| 17 |
+
'UCU': 'S', 'UCC': 'S', 'UCA': 'S', 'UCG': 'S', 'AGU': 'S', 'AGC': 'S', # Ser (6-fold)
|
| 18 |
+
'CCU': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', # Pro (4-fold)
|
| 19 |
+
'ACU': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', # Thr (4-fold)
|
| 20 |
+
'GCU': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', # Ala (4-fold)
|
| 21 |
+
'UAU': 'Y', 'UAC': 'Y', # Tyr (2-fold)
|
| 22 |
+
'UAA': '*', 'UAG': '*', 'UGA': '*', # 终止密码子 (排除)
|
| 23 |
+
'CAU': 'H', 'CAC': 'H', # His (2-fold)
|
| 24 |
+
'CAA': 'Q', 'CAG': 'Q', # Gln (2-fold)
|
| 25 |
+
'AAU': 'N', 'AAC': 'N', # Asn (2-fold)
|
| 26 |
+
'AAA': 'K', 'AAG': 'K', # Lys (2-fold)
|
| 27 |
+
'GAU': 'D', 'GAC': 'D', # Asp (2-fold)
|
| 28 |
+
'GAA': 'E', 'GAG': 'E', # Glu (2-fold)
|
| 29 |
+
'UGU': 'C', 'UGC': 'C', # Cys (2-fold)
|
| 30 |
+
'UGG': 'W', # Trp (无同义密码子,排除)
|
| 31 |
+
'CGU': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'AGA': 'R', 'AGG': 'R', # Arg (6-fold)
|
| 32 |
+
'GGU': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G' # Gly (4-fold)
|
| 33 |
+
}
|
| 34 |
+
def __init__(self, codon_usage_path, rna=True):
|
| 35 |
+
self.bases = 'GAUC'
|
| 36 |
+
self.aas = 'ACDEFGHIKLMNPQRSTVWY*'.lower()
|
| 37 |
+
self.codon_table = {}
|
| 38 |
+
self.max_aa_table = {}
|
| 39 |
+
self.cai_best_aa2nn_table = {}
|
| 40 |
+
self.frame_ith_aa_base_fraction = {
|
| 41 |
+
i: {
|
| 42 |
+
a: {
|
| 43 |
+
base: 0.0 for base in self.bases
|
| 44 |
+
} for a in self.aas
|
| 45 |
+
} for i in range(3)
|
| 46 |
+
}
|
| 47 |
+
# 1: {'A': {'A': 0.0, 'C': 0.0, 'G': 0.0, 'U': 0.0},
|
| 48 |
+
# 'C': {'A': 0.0, 'C': 0.0, 'G': 0.0, 'U': 0.0},
|
| 49 |
+
# 'G': {'A': 0.0, 'C': 0.0, 'G': 0.0, 'U': 0.0},
|
| 50 |
+
# 'U': {'A': 0.0, 'C': 0.0, 'G': 0.0, 'U': 0.0}},
|
| 51 |
+
|
| 52 |
+
# self.frame_ith_aa_base_fraction = {0: defaultdict(list), 1: defaultdict(list), 2: defaultdict(list)}
|
| 53 |
+
# self.frame_ith_aa_base_fraction = {i:{a:{base:defaultdict(float)} for a in self.aas for base in self.bases} for i in range(3)}
|
| 54 |
+
# rna参数现在只用于控制输出格式,输入可以是RNA或DNA
|
| 55 |
+
self.output_rna = rna
|
| 56 |
+
|
| 57 |
+
# RNA标准密码子表(用于ENC和RSCU计算)
|
| 58 |
+
self.standard_codon_table = self.CODON_TO_AA
|
| 59 |
+
|
| 60 |
+
# 按简并度预分组氨基酸
|
| 61 |
+
self.degeneracy_groups = {
|
| 62 |
+
'2-fold': ['F', 'Y', 'C', 'H', 'Q', 'N', 'K', 'D', 'E'],
|
| 63 |
+
'3-fold': ['I'],
|
| 64 |
+
'4-fold': ['V', 'P', 'T', 'A', 'G'],
|
| 65 |
+
'6-fold': ['L', 'S', 'R']
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
# print(f"\nOutput format: {'RNA' if self.output_rna else 'DNA'}")
|
| 69 |
+
# print(f"Loading codon usage table from {codon_usage_path}")
|
| 70 |
+
# print("suppose csv in the format columns: 'codon', 'amino_acid', 'fraction'\n")
|
| 71 |
+
if os.access(codon_usage_path, os.R_OK) and os.path.getsize(codon_usage_path) > 0:
|
| 72 |
+
with open(codon_usage_path, 'r') as codon_file:
|
| 73 |
+
next(codon_file) # Skip the header line
|
| 74 |
+
for line in codon_file:
|
| 75 |
+
line = line.strip()
|
| 76 |
+
if not line:
|
| 77 |
+
continue
|
| 78 |
+
codon, aa, fraction, *_ = line.split(',')
|
| 79 |
+
# 内部统一存储为RNA格式, AA 小写
|
| 80 |
+
codon = codon.upper().replace('T', 'U')
|
| 81 |
+
aa = aa.lower()
|
| 82 |
+
fraction = float(fraction)
|
| 83 |
+
|
| 84 |
+
self.codon_table[codon] = (aa, fraction)
|
| 85 |
+
for i,base in enumerate(codon):
|
| 86 |
+
# print(i,aa,base,fraction,self.frame_ith_aa_base_fraction[i][aa])
|
| 87 |
+
# self.frame_ith_table[i][aa].append((base, fraction))
|
| 88 |
+
self.frame_ith_aa_base_fraction[i][aa][base] = fraction + self.frame_ith_aa_base_fraction[i][aa][base]
|
| 89 |
+
# self.frame_ith_table[i][aa][base] = fraction + self.frame_ith_table[i][aa][base]
|
| 90 |
+
if aa not in self.max_aa_table or self.max_aa_table[aa] < fraction:
|
| 91 |
+
self.max_aa_table[aa] = fraction
|
| 92 |
+
self.cai_best_aa2nn_table[aa] = codon
|
| 93 |
+
# frame_ith_table = [self.frame_ith_table[i][aa] for aa in self.frame_ith_table[i] for i in range(3)]
|
| 94 |
+
print(f"Codon usage table loaded, {len(self.codon_table)} codons loaded from {codon_usage_path}")
|
| 95 |
+
else:
|
| 96 |
+
print(f'codon usage table is missing',codon_usage_path)
|
| 97 |
+
|
| 98 |
+
self.aa_to_codons = self._build_aa_to_codons()
|
| 99 |
+
# 预计算氨基酸到密码子权重的映射(用于加权随机)
|
| 100 |
+
self.aa_to_weights = self._build_aa_to_weights()
|
| 101 |
+
self.calculate_CAI = self.calc_cai
|
| 102 |
+
|
| 103 |
+
def _build_aa_to_codons(self):
|
| 104 |
+
"""构建氨基酸到密码子列表的映射"""
|
| 105 |
+
aa_to_codons = defaultdict(list)
|
| 106 |
+
for codon, (aa, _) in self.codon_table.items():
|
| 107 |
+
aa_to_codons[aa].append(codon)
|
| 108 |
+
return dict(aa_to_codons)
|
| 109 |
+
|
| 110 |
+
def _build_aa_to_weights(self):
|
| 111 |
+
"""构建氨基酸到密码子权重的映射"""
|
| 112 |
+
aa_to_weights = defaultdict(list)
|
| 113 |
+
for codon, (aa, weight) in self.codon_table.items():
|
| 114 |
+
aa_to_weights[aa].append(weight)
|
| 115 |
+
return dict(aa_to_weights)
|
| 116 |
+
|
| 117 |
+
def _normalize_sequence(self, sequence: str) -> str:
|
| 118 |
+
"""标准化序列为RNA格式"""
|
| 119 |
+
sequence = sequence.upper()
|
| 120 |
+
# 将DNA转换为RNA格式(内部统一使用RNA)
|
| 121 |
+
sequence = sequence.replace('T', 'U')
|
| 122 |
+
return sequence
|
| 123 |
+
|
| 124 |
+
def _validate_sequence(self, sequence: str) -> str:
|
| 125 |
+
"""验证并标准化序列"""
|
| 126 |
+
sequence = self._normalize_sequence(sequence)
|
| 127 |
+
|
| 128 |
+
if len(sequence) % 3 != 0:
|
| 129 |
+
raise ValueError(f"序列长度必须是3的倍数,当前长度: {len(sequence)}")
|
| 130 |
+
|
| 131 |
+
valid_bases = {'A', 'U', 'C', 'G'}
|
| 132 |
+
if not all(base in valid_bases for base in sequence):
|
| 133 |
+
raise ValueError("序列包含无效的碱基字符")
|
| 134 |
+
|
| 135 |
+
return sequence
|
| 136 |
+
|
| 137 |
+
def _count_codons(self, sequence: str) -> Dict[str, int]:
|
| 138 |
+
"""统计序列中密码子使用次数"""
|
| 139 |
+
sequence = self._validate_sequence(sequence)
|
| 140 |
+
codon_count = {}
|
| 141 |
+
num_codons = len(sequence) // 3
|
| 142 |
+
|
| 143 |
+
for i in range(num_codons):
|
| 144 |
+
codon = sequence[i * 3:(i + 1) * 3]
|
| 145 |
+
if codon in self.standard_codon_table and self.standard_codon_table[codon] != '*':
|
| 146 |
+
codon_count[codon] = codon_count.get(codon, 0) + 1
|
| 147 |
+
|
| 148 |
+
return codon_count
|
| 149 |
+
|
| 150 |
+
@staticmethod
|
| 151 |
+
def translate_sequence(sequence: str) -> str:
|
| 152 |
+
"""将序列翻译为氨基酸序列"""
|
| 153 |
+
sequence = sequence.upper().replace('T', 'U')
|
| 154 |
+
aa_seq = ''
|
| 155 |
+
for i in range(0, len(sequence), 3):
|
| 156 |
+
codon = sequence[i:i + 3]
|
| 157 |
+
if codon in Codon.CODON_TO_AA:
|
| 158 |
+
aa = Codon.CODON_TO_AA[codon]
|
| 159 |
+
aa_seq += aa
|
| 160 |
+
return aa_seq
|
| 161 |
+
def calc_cai(self, seq):
|
| 162 |
+
"""计算CAI值,输入可以是RNA或DNA序列"""
|
| 163 |
+
# 标准化序列为RNA格式
|
| 164 |
+
seq = self._normalize_sequence(seq)
|
| 165 |
+
if len(seq) % 3 != 0:
|
| 166 |
+
# raise ValueError(f"序列长度必须是3的倍数, 当前长度: {len(seq)},{seq}")
|
| 167 |
+
return np.nan
|
| 168 |
+
cai = 0.0
|
| 169 |
+
valid_num = 0
|
| 170 |
+
for i in range(0, len(seq), 3):
|
| 171 |
+
codon = seq[i:i + 3]
|
| 172 |
+
if codon not in self.codon_table:
|
| 173 |
+
continue
|
| 174 |
+
aa, fraction = self.codon_table[codon]
|
| 175 |
+
f_c_max = self.max_aa_table[aa]
|
| 176 |
+
|
| 177 |
+
w_i = fraction / f_c_max
|
| 178 |
+
cai += np.log2(w_i)
|
| 179 |
+
valid_num += 1
|
| 180 |
+
|
| 181 |
+
return np.exp2(cai / valid_num) if valid_num > 0 else 0.0
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def cai_opt_codon(self, aa_seq):
|
| 185 |
+
aa_seq = aa_seq.lower()
|
| 186 |
+
"""获取CAI最优密码子序列"""
|
| 187 |
+
cai_opt_codon = []
|
| 188 |
+
for i in range(0, len(aa_seq), 1):
|
| 189 |
+
aa = aa_seq[i]
|
| 190 |
+
codon = self.cai_best_aa2nn_table.get(aa, '___')
|
| 191 |
+
# 根据输出格式转换
|
| 192 |
+
if not self.output_rna:
|
| 193 |
+
codon = codon.replace('U', 'T')
|
| 194 |
+
cai_opt_codon.append(codon)
|
| 195 |
+
return ''.join(cai_opt_codon)
|
| 196 |
+
|
| 197 |
+
def random_codon(self, aa_seq):
|
| 198 |
+
"""
|
| 199 |
+
根据密码子频率加权随机生成CDS序列
|
| 200 |
+
|
| 201 |
+
参数:
|
| 202 |
+
aa_sequence (str): 氨基酸序列(单字母)
|
| 203 |
+
|
| 204 |
+
返回:
|
| 205 |
+
str: 随机生成的DNA序列
|
| 206 |
+
"""
|
| 207 |
+
|
| 208 |
+
aa_seq = aa_seq.lower()
|
| 209 |
+
opt_codon = []
|
| 210 |
+
for i in range(0, len(aa_seq), 1):
|
| 211 |
+
aa = aa_seq[i]
|
| 212 |
+
|
| 213 |
+
if aa not in self.aa_to_codons:
|
| 214 |
+
codon = '___'
|
| 215 |
+
else:
|
| 216 |
+
codons = self.aa_to_codons[aa] # ['AUG']
|
| 217 |
+
weights = self.aa_to_weights[aa] # [1.0]
|
| 218 |
+
|
| 219 |
+
codon = random.choices(codons, weights=weights, k=1)[0]
|
| 220 |
+
opt_codon.append(codon)
|
| 221 |
+
|
| 222 |
+
opt_nn = ''.join(opt_codon)
|
| 223 |
+
# 根据输出格式转换
|
| 224 |
+
if not self.output_rna:
|
| 225 |
+
opt_nn = opt_nn.replace('U', 'T')
|
| 226 |
+
return opt_nn
|
| 227 |
+
|
| 228 |
+
def random_codon_weight(self, aa_seq,weights_df=None):
|
| 229 |
+
"""
|
| 230 |
+
根据密码子频率加权随机生成CDS序列
|
| 231 |
+
|
| 232 |
+
参数:
|
| 233 |
+
aa_sequence (str): 氨基酸序列(单字母)
|
| 234 |
+
|
| 235 |
+
返回:
|
| 236 |
+
str: 随机生成的DNA序列
|
| 237 |
+
"""
|
| 238 |
+
if weights_df is None:
|
| 239 |
+
return self.random_codon(aa_seq)
|
| 240 |
+
|
| 241 |
+
# weights_df.columns = ['triplet', 'amino_acid', 'fraction']
|
| 242 |
+
# weights_df_gp = weights_df.groupby(by='amino_acid')
|
| 243 |
+
aa_seq = aa_seq.lower()
|
| 244 |
+
opt_codon = []
|
| 245 |
+
for i in range(0, len(aa_seq), 1):
|
| 246 |
+
aa = aa_seq[i]
|
| 247 |
+
|
| 248 |
+
if aa not in self.aa_to_codons:
|
| 249 |
+
codon = '___'
|
| 250 |
else:
|
| 251 |
+
tmp = weights_df[weights_df['amino_acid']==aa]
|
| 252 |
+
codon = random.choices(tmp['triplet'].to_list(), weights=tmp['fraction'].to_list(), k=1)[0]
|
| 253 |
+
opt_codon.append(codon)
|
| 254 |
+
|
| 255 |
+
opt_nn = ''.join(opt_codon)
|
| 256 |
+
# 根据输出格式转换
|
| 257 |
+
if not self.output_rna:
|
| 258 |
+
opt_nn = opt_nn.replace('U', 'T')
|
| 259 |
+
return opt_nn
|
| 260 |
+
|
| 261 |
+
def calculate_ENC(self, sequence: str) -> float:
|
| 262 |
+
"""
|
| 263 |
+
计算单条序列的ENC值,输入可以是RNA或DNA序列
|
| 264 |
+
|
| 265 |
+
参数:
|
| 266 |
+
sequence: 序列字符串
|
| 267 |
+
|
| 268 |
+
返回:
|
| 269 |
+
enc_value: ENC值
|
| 270 |
+
"""
|
| 271 |
+
codon_count = self._count_codons(sequence)
|
| 272 |
+
|
| 273 |
+
# 按氨基酸分组
|
| 274 |
+
amino_acid_counts = {}
|
| 275 |
+
for codon, aa in self.standard_codon_table.items():
|
| 276 |
+
if aa in ['M', 'W'] or aa == '*':
|
| 277 |
+
continue
|
| 278 |
+
if aa not in amino_acid_counts:
|
| 279 |
+
amino_acid_counts[aa] = {}
|
| 280 |
+
amino_acid_counts[aa][codon] = codon_count.get(codon, 0)
|
| 281 |
+
|
| 282 |
+
# 计算每个氨基酸组的F值
|
| 283 |
+
F_values = {'2-fold': [], '3-fold': [], '4-fold': [], '6-fold': []}
|
| 284 |
+
|
| 285 |
+
for aa, codon_counts in amino_acid_counts.items():
|
| 286 |
+
# 确定简并度
|
| 287 |
+
degeneracy = None
|
| 288 |
+
for deg, aas in self.degeneracy_groups.items():
|
| 289 |
+
if aa in aas:
|
| 290 |
+
degeneracy = deg
|
| 291 |
+
break
|
| 292 |
+
|
| 293 |
+
if not degeneracy:
|
| 294 |
+
continue
|
| 295 |
+
|
| 296 |
+
# 获取该氨基酸的所有同义密码子
|
| 297 |
+
codons_for_aa = [c for c, a in self.standard_codon_table.items()
|
| 298 |
+
if a == aa and a not in ['M', 'W'] and a != '*']
|
| 299 |
+
s = len(codons_for_aa)
|
| 300 |
+
|
| 301 |
+
# 统计使用次数
|
| 302 |
+
n_i_values = [codon_counts.get(codon, 0) for codon in codons_for_aa]
|
| 303 |
+
total_n = sum(n_i_values)
|
| 304 |
+
|
| 305 |
+
if total_n == 0 or s <= 1:
|
| 306 |
+
continue
|
| 307 |
+
|
| 308 |
+
# 计算F值
|
| 309 |
+
sum_squared_freq = sum((n_i / total_n) ** 2 for n_i in n_i_values)
|
| 310 |
+
F = (s * sum_squared_freq - 1) / (s - 1)
|
| 311 |
+
|
| 312 |
+
F_values[degeneracy].append(F)
|
| 313 |
+
|
| 314 |
+
# 计算各简并度的平均F值
|
| 315 |
+
# F2_avg = np.mean(F_values['2-fold']) if F_values['2-fold'] else 1.0
|
| 316 |
+
# F3_avg = np.mean(F_values['3-fold']) if F_values['3-fold'] else 1.0
|
| 317 |
+
# F4_avg = np.mean(F_values['4-fold']) if F_values['4-fold'] else 1.0
|
| 318 |
+
# F6_avg = np.mean(F_values['6-fold']) if F_values['6-fold'] else 1.0
|
| 319 |
+
enc_value = 2.0
|
| 320 |
+
|
| 321 |
+
if F_values['2-fold']:
|
| 322 |
+
enc_value += 9.0 / np.mean(F_values['2-fold'])
|
| 323 |
+
if F_values['3-fold']:
|
| 324 |
+
enc_value += 1.0 / np.mean(F_values['3-fold'])
|
| 325 |
+
if F_values['4-fold']:
|
| 326 |
+
enc_value += 5.0 / np.mean(F_values['4-fold'])
|
| 327 |
+
if F_values['6-fold']:
|
| 328 |
+
enc_value += 3.0 / np.mean(F_values['6-fold'])
|
| 329 |
+
# 计算ENC值
|
| 330 |
+
# enc_value = 2 + 9 / F2_avg + 1 / F3_avg + 5 / F4_avg + 3 / F6_avg
|
| 331 |
+
|
| 332 |
+
return enc_value
|
| 333 |
+
|
| 334 |
+
def calculate_RSCU(self, sequences: List[str]) -> Dict[str, float]:
|
| 335 |
+
"""
|
| 336 |
+
计算相对同义密码子使用度 (Relative Synonymous Codon Usage, RSCU)
|
| 337 |
+
|
| 338 |
+
参数:
|
| 339 |
+
sequences: 序列列表(可以是RNA或DNA)
|
| 340 |
+
|
| 341 |
+
返回:
|
| 342 |
+
rscu_dict: 每个密码子的RSCU值字典(RNA格式)
|
| 343 |
+
"""
|
| 344 |
+
total_codon_count = defaultdict(int)
|
| 345 |
+
aa_observed_codons = defaultdict(set)
|
| 346 |
+
|
| 347 |
+
# 统计所有序列的密码子使用
|
| 348 |
+
for seq in sequences:
|
| 349 |
+
try:
|
| 350 |
+
codon_count = self._count_codons(seq)
|
| 351 |
+
for codon, count in codon_count.items():
|
| 352 |
+
aa = self.standard_codon_table[codon]
|
| 353 |
+
total_codon_count[codon] += count
|
| 354 |
+
aa_observed_codons[aa].add(codon)
|
| 355 |
+
except ValueError:
|
| 356 |
+
continue # 跳过无效序列
|
| 357 |
+
|
| 358 |
+
# 计算RSCU
|
| 359 |
+
rscu_dict = {}
|
| 360 |
+
aa_total_count = defaultdict(int)
|
| 361 |
+
|
| 362 |
+
# 首先计算每个氨基酸的总密码子数
|
| 363 |
+
for codon, count in total_codon_count.items():
|
| 364 |
+
aa = self.standard_codon_table[codon]
|
| 365 |
+
aa_total_count[aa] += count
|
| 366 |
+
|
| 367 |
+
# 然后计算每个密码子的RSCU
|
| 368 |
+
for codon, count in total_codon_count.items():
|
| 369 |
+
aa = self.standard_codon_table[codon]
|
| 370 |
+
if aa_total_count[aa] > 0:
|
| 371 |
+
# 该氨基酸的同义密码子数量
|
| 372 |
+
synonymous_codons = len([c for c in aa_observed_codons[aa]
|
| 373 |
+
if self.standard_codon_table[c] == aa])
|
| 374 |
+
expected_count = aa_total_count[aa] / synonymous_codons
|
| 375 |
+
rscu_dict[codon] = count / expected_count if expected_count > 0 else 0.0
|
| 376 |
+
else:
|
| 377 |
+
rscu_dict[codon] = 0.0
|
| 378 |
+
|
| 379 |
+
return rscu_dict
|
| 380 |
+
|
| 381 |
+
def analyze_sequence(self, sequence: str, sequence_name: str = "") -> Dict:
|
| 382 |
+
"""
|
| 383 |
+
综合分析单条序列的密码子使用特征
|
| 384 |
+
|
| 385 |
+
参数:
|
| 386 |
+
sequence: 序列字符串(可以是RNA或DNA)
|
| 387 |
+
sequence_name: 序列名称(可选)
|
| 388 |
+
|
| 389 |
+
返回:
|
| 390 |
+
包含所有指标的字典
|
| 391 |
+
"""
|
| 392 |
+
try:
|
| 393 |
+
enc = self.calculate_ENC(sequence)
|
| 394 |
+
cai = self.calc_cai(sequence)
|
| 395 |
+
result = {
|
| 396 |
+
'Sequence_Name': sequence_name,
|
| 397 |
+
'Sequence_Length': len(sequence),
|
| 398 |
+
'ENC': round(enc, 3),
|
| 399 |
+
'ENC_Preference': 'strong' if enc <= 35 else 'week',
|
| 400 |
+
'CAI': round(cai, 3),
|
| 401 |
+
'CAI_Level': 'high' if cai > 0.7 else 'low'
|
| 402 |
+
}
|
| 403 |
+
|
| 404 |
+
return result
|
| 405 |
+
|
| 406 |
+
except Exception as e:
|
| 407 |
+
return {
|
| 408 |
+
'Sequence_Name': sequence_name,
|
| 409 |
+
'Sequence_Length': len(sequence),
|
| 410 |
+
'ENC': None,
|
| 411 |
+
'CAI': None,
|
| 412 |
+
'Error': str(e)
|
| 413 |
+
}
|
| 414 |
+
|
| 415 |
+
@staticmethod
|
| 416 |
+
def modify_func(sequence):
|
| 417 |
+
return '_'*len(sequence)
|
| 418 |
+
@staticmethod
|
| 419 |
+
def modify_codon_by_frames(sequence, frames=[1,2,3], modify_func=None):
|
| 420 |
+
"""
|
| 421 |
+
高级版本:支持自定义修改函数
|
| 422 |
+
|
| 423 |
+
参数:
|
| 424 |
+
sequence (str): 输入序列
|
| 425 |
+
frame (int): 要修改的密码子位置 (1, 2, 3)
|
| 426 |
+
modify_func (callable): 修改函数,接收原帧字符串,返回修改后的字符串
|
| 427 |
+
|
| 428 |
+
返回:
|
| 429 |
+
str: 修改后的重建序列
|
| 430 |
+
"""
|
| 431 |
+
# 清理序列
|
| 432 |
+
seq = sequence.upper().replace(' ', '').replace('\n', '')
|
| 433 |
+
seq = seq[:len(seq) - len(seq) % 3]
|
| 434 |
+
|
| 435 |
+
# 使用切片提取帧
|
| 436 |
+
frames = [seq[0::3], seq[1::3], seq[2::3]]
|
| 437 |
+
|
| 438 |
+
reconstructed_list =[]
|
| 439 |
+
# 应用修改函数
|
| 440 |
+
for frame in frames:
|
| 441 |
+
frame_index = frame - 1
|
| 442 |
+
if modify_func:
|
| 443 |
+
frames[frame_index] = modify_func(frames[frame_index])
|
| 444 |
+
|
| 445 |
+
# 重建序列
|
| 446 |
+
reconstructed = ''.join(
|
| 447 |
+
frames[0][i] + frames[1][i] + frames[2][i]
|
| 448 |
+
for i in range(len(frames[0]))
|
| 449 |
+
)
|
| 450 |
+
reconstructed_list.append(reconstructed)
|
| 451 |
+
|
| 452 |
+
return reconstructed_list
|
| 453 |
+
|
| 454 |
+
# 使用示例 - 测试所有功能
|
| 455 |
+
def example_usage():
|
| 456 |
+
"""测试所有功能"""
|
| 457 |
+
print("=" * 60)
|
| 458 |
+
print("测试 Codon 类的所有功能")
|
| 459 |
+
print("=" * 60)
|
| 460 |
+
|
| 461 |
+
# 测试数据
|
| 462 |
+
species_list = ["mouse", "Ec", "Sac", "Pic", "Human"]
|
| 463 |
+
test_species = "mouse" # 选择一个物种进行详细测试
|
| 464 |
+
|
| 465 |
+
# 测试序列
|
| 466 |
+
aa_seq = "MASV"
|
| 467 |
+
dna_seq = "ATGGCCATGGCGCCCAGAACTGAGATCAAATAGTACCCGTATTAACGGGTA"
|
| 468 |
+
rna_seq = dna_seq.replace('T', 'U')
|
| 469 |
+
# 测试序列集合(用于RSCU计算)
|
| 470 |
+
test_sequences = [
|
| 471 |
+
"AUGGCUUCUUUUUUCUUCUUCUUCUUCUUCUUCCUCCUCCUCCUCCUCCUCCUCCUC", # RNA
|
| 472 |
+
"ATGGCUUCUUUUCUCGUAUACACAGATGACTACGTTAGCAGCTACGTTACGTTACGTTACG", # DNA
|
| 473 |
+
"AUGGUUUGUUGGUUGGUUGGUUGGUUGGUUGGUUGGUUGGUUGGUUGGUUGGUUGGA" # RNA
|
| 474 |
]
|
| 475 |
|
| 476 |
+
# 单个测试序列
|
| 477 |
+
test_sequence = "AUGGCUUCUUUUCUCGUAUACACAGAUGACUACGUAGCAGCUACGUACGUACGUACG"
|
| 478 |
+
|
| 479 |
+
|
| 480 |
+
Codon.translate_sequence(dna_seq) # 验证translate_sequence函数
|
| 481 |
+
|
| 482 |
+
|
| 483 |
+
# 假设的密码子使用表路径
|
| 484 |
+
codon_table_path = "/Users/gz_julse/code/minimind_RiboUTR/maotao_file/codon_table/codon_usage_{species}.csv"
|
| 485 |
+
|
| 486 |
+
print(f"\n1. 初始化 Codon 实例 (物种: {species_list})")
|
| 487 |
+
print("-" * 50)
|
| 488 |
+
|
| 489 |
+
# 创建分析器实例,输出格式为RNA和DNA各一个
|
| 490 |
+
codon_instance_dna = {species: Codon(codon_table_path.format(species=species), rna=False) for species in
|
| 491 |
+
species_list}
|
| 492 |
+
codon_instance_rna = {species: Codon(codon_table_path.format(species=species), rna=True) for species in
|
| 493 |
+
species_list}
|
| 494 |
+
|
| 495 |
+
|
| 496 |
+
print(f"✓ 成功创建 {len(species_list)} 个物种的 Codon 实例")
|
| 497 |
+
|
| 498 |
+
print(f"\n2. 测试 CAI 计算")
|
| 499 |
+
print("-" * 50)
|
| 500 |
+
|
| 501 |
+
# 测试DNA和RNA序列输入
|
| 502 |
+
print("DNA序列CAI:", [codon_instance_rna[species].calc_cai(dna_seq) for species in species_list])
|
| 503 |
+
print("RNA序列CAI:", [codon_instance_rna[species].calc_cai(rna_seq) for species in species_list])
|
| 504 |
+
|
| 505 |
+
# 验证DNA和RNA输入结果一致
|
| 506 |
+
dna_cai = codon_instance_rna[test_species].calc_cai(dna_seq)
|
| 507 |
+
rna_cai = codon_instance_rna[test_species].calc_cai(rna_seq)
|
| 508 |
+
print(f"✓ DNA和RNA输入结果一致: {np.isclose(dna_cai, rna_cai)}")
|
| 509 |
+
|
| 510 |
+
print(f"\n3. 测试 CAI 最优密码子序列")
|
| 511 |
+
print("-" * 50)
|
| 512 |
+
|
| 513 |
+
# 测试最优密码子序列
|
| 514 |
+
opt_rna = codon_instance_rna[test_species].cai_opt_codon(aa_seq)
|
| 515 |
+
opt_dna = codon_instance_dna[test_species].cai_opt_codon(aa_seq)
|
| 516 |
+
|
| 517 |
+
print(f"氨基酸序列: {aa_seq}")
|
| 518 |
+
print(f"RNA格式最优密码子: {opt_rna}")
|
| 519 |
+
print(f"DNA格式最优密码子: {opt_dna}")
|
| 520 |
+
print(f"✓ 输出格式正确: RNA={opt_rna.replace('T', '') == opt_rna}, DNA={opt_dna.replace('U', '') == opt_dna}")
|
| 521 |
+
|
| 522 |
+
print(f"\n4. 测试 ENC 计算")
|
| 523 |
+
print("-" * 50)
|
| 524 |
+
|
| 525 |
+
# 测试ENC计算
|
| 526 |
+
enc_dna = codon_instance_rna[test_species].calculate_ENC(dna_seq)
|
| 527 |
+
enc_rna = codon_instance_rna[test_species].calculate_ENC(rna_seq)
|
| 528 |
+
|
| 529 |
+
print(f"DNA序列ENC: {enc_dna:.3f}")
|
| 530 |
+
print(f"RNA序列ENC: {enc_rna:.3f}")
|
| 531 |
+
print(f"✓ DNA和RNA输入结果一致: {np.isclose(enc_dna, enc_rna)}")
|
| 532 |
+
|
| 533 |
+
print(f"\n5. 测试 RSCU 计算")
|
| 534 |
+
print("-" * 50)
|
| 535 |
+
|
| 536 |
+
# 测试RSCU计算
|
| 537 |
+
rscu_results = codon_instance_rna[test_species].calculate_RSCU(test_sequences)
|
| 538 |
+
print(f"计算了 {len(rscu_results)} 个密码子的RSCU值")
|
| 539 |
+
print("前10个密码子的RSCU值:")
|
| 540 |
+
for i, (codon, rscu) in enumerate(list(rscu_results.items())[:10]):
|
| 541 |
+
print(f" {codon}: {rscu:.3f}")
|
| 542 |
+
|
| 543 |
+
print(f"\n6. 测试综合分析 (analyze_sequence)")
|
| 544 |
+
print("-" * 50)
|
| 545 |
+
|
| 546 |
+
# 测试综合分析
|
| 547 |
+
analysis_result = codon_instance_rna[test_species].analyze_sequence(test_sequence, "Test_Gene")
|
| 548 |
+
print("综合分析结果:")
|
| 549 |
+
for key, value in analysis_result.items():
|
| 550 |
+
print(f" {key}: {value}")
|
| 551 |
+
|
| 552 |
+
print(f"\n7. 测试序列验证功能")
|
| 553 |
+
print("-" * 50)
|
| 554 |
+
|
| 555 |
+
# 测试无效序列
|
| 556 |
+
invalid_seqs = [
|
| 557 |
+
"AUGGCUUCUUUUCUCG", # 长度不是3的倍数
|
| 558 |
+
"AUGXXXUUUUCUCGUAUACACAGAUGACUACGUAGCAGCUACGUACGUACGUACG", # 包含无效字符
|
| 559 |
]
|
| 560 |
|
| 561 |
+
for i, seq in enumerate(invalid_seqs):
|
| 562 |
+
try:
|
| 563 |
+
codon_instance_rna[test_species]._validate_sequence(seq)
|
| 564 |
+
print(f"序列 {i + 1}: 错误地通过了验证")
|
| 565 |
+
except ValueError as e:
|
| 566 |
+
print(f"序列 {i + 1}: 正确捕获错误 - {e}")
|
| 567 |
+
|
| 568 |
+
print(f"\n8. 测试密码子计数")
|
| 569 |
+
print("-" * 50)
|
| 570 |
+
|
| 571 |
+
# 测试密码子计数
|
| 572 |
+
codon_count = codon_instance_rna[test_species]._count_codons(test_sequence)
|
| 573 |
+
print(f"序列 '{test_sequence[:20]}...' 的密码子计数:")
|
| 574 |
+
for codon, count in list(codon_count.items())[:5]:
|
| 575 |
+
print(f" {codon}: {count}")
|
| 576 |
+
print(f" ... (共 {len(codon_count)} 种密码子)")
|
| 577 |
+
|
| 578 |
+
print(f"\n9. 测试不同输出格式的兼容性")
|
| 579 |
+
print("-" * 50)
|
| 580 |
+
|
| 581 |
+
# 验证RNA和DNA输出实例的CAI计算相同
|
| 582 |
+
cai_rna_instance = codon_instance_rna[test_species].calc_cai(test_sequence)
|
| 583 |
+
cai_dna_instance = codon_instance_dna[test_species].calc_cai(test_sequence)
|
| 584 |
+
|
| 585 |
+
print(f"RNA输出实例CAI: {cai_rna_instance:.4f}")
|
| 586 |
+
print(f"DNA输出实例CAI: {cai_dna_instance:.4f}")
|
| 587 |
+
print(f"✓ 不同输出格式实例的CAI计算相同: {np.isclose(cai_rna_instance, cai_dna_instance)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 588 |
|
| 589 |
+
print(f"\n" + "=" * 60)
|
| 590 |
+
print("所有功能测试完成!")
|
| 591 |
+
print("=" * 60)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 592 |
|
| 593 |
|
|
|
|
| 594 |
if __name__ == "__main__":
|
| 595 |
+
example_usage()
|
|
|