Spaces:

julse
/

maotao

Running

App Files Files Community

julse committed on Dec 24, 2025

Commit

a0727ad

verified ·

1 Parent(s): 34da6e2

Update model/codon_attr.py

Browse files

Files changed (1) hide show

model/codon_attr.py +582 -371

model/codon_attr.py CHANGED Viewed

@@ -1,384 +1,595 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-"""
-Title     : drawRNA.py
-project   : web
-Created by: julse
-Created on: 2025/7/4 14:24
-des: TODO
-"""
-import sys
 import os
-import time
 import pandas as pd
 import numpy as np
-import gradio as gr
-import pandas as pd
-import numpy as np
-import os
-import tempfile
-import subprocess
-from PIL import Image
-# 定义颜色样式 - 固定使用一组预定义颜色
-COLORS = [
-    '#FF0000',  # 红色 - UTR5
-    '#0000FF',  # 蓝色 - CDS起始区
-    '#FFC0CB',  # 粉色 - CDS终止区
-    '#FFA500',  # 橙色 - UTR3
-    '#FFFF00',  # 黄色 - 起始密码子
-    '#800080'  # 紫色 - 终止密码子
-]
-COLOR_MAP = {
-    'UTR5': '#FF0000',        # 红色
-    'CDS_start': '#0000FF',   # 蓝色 - CDS起始区
-    'CDS_mid': '#00FF00',     # 绿色 - CDS中间区（添加的）
-    'CDS_end': '#FFC0CB',     # 粉色 - CDS终止区
-    'UTR3': '#FFA500',        # 橙色
-    'start_codon': '#FFFF00', # 黄色 - 起始密码子
-    'stop_codon': '#800080',  # 紫色 - 终止密码子
-    'intron': '#A9A9A9',      # 灰色 - 内含子（添加的）
-    'exon': '#90EE90',        # 浅绿色 - 外显子（添加的）
-}
-def get_bases_index(utr5, cds, utr3):
-    """计算各区域的位置索引"""
-    start_codon_idx = len(utr5)
-    stop_codon_idx = len(utr5) + len(cds)
-    # UTR5区域
-    utr5_start = max(0, start_codon_idx - 300)
-    utr5_range = list(range(utr5_start + 1, start_codon_idx + 1))
-    # CDS起始区（不包括起始密码子）
-    cds_start = start_codon_idx + 3
-    cds_end = min(start_codon_idx + 300, stop_codon_idx - 3)
-    start_codon_range = list(range(cds_start + 1, cds_end + 1))
-    # CDS终止区（不包括终止密码子）
-    cds_start = max(start_codon_idx, stop_codon_idx - 300)
-    stop_codon_range = list(range(cds_start + 1, stop_codon_idx - 2))
-    # UTR3区域
-    utr3_range = list(range(stop_codon_idx + 1, min(stop_codon_idx + 301, stop_codon_idx + len(utr3) + 1)))
-    # 起始密码子 (3个碱基)
-    start_codon = list(range(start_codon_idx + 1, start_codon_idx + 4))
-    # 终止密码子 (3个碱基)
-    stop_codon = list(range(stop_codon_idx - 2, stop_codon_idx + 1))
-    # 转换为逗号分隔的字符串
-    return (
-        ",".join(map(str, utr5_range)),
-        ",".join(map(str, start_codon_range)),
-        ",".join(map(str, stop_codon_range)),
-        ",".join(map(str, utr3_range)),
-        ",".join(map(str, start_codon)),
-        ",".join(map(str, stop_codon))
-    )
-def calc_mfe(seq):
-    import RNA
-    fc = RNA.fold_compound(seq)
-    ss, mfe = fc.mfe()
-    return ss, mfe
-def dbn_to_tuple(dbn, c1_region=[], c2_region=[]):
-    # 构建配对字典
-    stack, pairs = [], {}
-    for i, char in enumerate(dbn):
-        if char == '(':
-            stack.append(i)
-        elif char == ')':
-            j = stack.pop()
-            if len(c1_region) == 0 or len(c2_region) == 0:
-                pairs[i + 1] = j + 1
             else:
-                if i + 1 in c2_region and j + 1 in c1_region:
-                    pairs[i + 1] = j + 1
-    return pairs
-def run_cmd(command, output_file):
-    # 执行命令
-    result = subprocess.run(command, capture_output=True, text=True)
-    # 检查是否执行成功
-    if result.returncode != 0:
-        error_msg = f"执行VARNA命令时出错:\n{result.stderr}"
-        os.unlink(output_file)  # 删除临时文件
-        raise RuntimeError(error_msg)
-    # 检查文件是否成功创建
-    if not os.path.exists(output_file):
-        raise FileNotFoundError("未能生成结构图文件")
-def run_draw_rna_advanced(full_sequence, structure, utr5_range, start_codon_range,
-                          stop_codon_range, utr3_range, start_codon, stop_codon,
-                          focus_region, auxBPs, output_file,algorithm, title=''):
-    import matplotlib.pyplot as plt
-    from draw_rna.ipynb_draw import draw_struct
-    # 解析输入
-    utr5_range = eval(utr5_range)
-    start_codon_range = eval(start_codon_range)
-    stop_codon_range = eval(stop_codon_range)
-    utr3_range = eval(utr3_range)
-    start_codon = eval(start_codon)
-    stop_codon = eval(stop_codon)
-    # 定义颜色方
-    # 颜色映射
-    COLOR_MAP = {
-        'UTR5': '#FF0000',  # 红色
-        'CDS_start': '#0000FF',  # 蓝色 - CDS起始区
-        'CDS_end': '#FFC0CB',  # 粉色 - CDS终止区
-        'UTR3': '#FFA500',  # 橙色
-        'start_codon': '#FFFF00',  # 黄色 - 起始密码子
-        'stop_codon': '#800080',  # 紫色 - 终止密码子
-        'default': '#808080'  # 灰色
-    }
-    # 区域到数值的映射
-    region_to_value = {
-        'default':0,
-        'UTR5': 1,
-        'CDS_start': 2,
-        'CDS_end': 3,
-        'UTR3': 4,
-        'start_codon': 5,
-        'stop_codon': 6
-    }
-    # 自定义colormap
-    from matplotlib.colors import ListedColormap
-    # 创建自定义颜色列表，按照数值顺序
-    custom_colors = [
-        COLOR_MAP['default'],  # 0: 灰色
-        COLOR_MAP['UTR5'],  # 1: 红色
-        COLOR_MAP['CDS_start'],  # 2: 蓝色
-        COLOR_MAP['CDS_end'],  # 3: 粉色
-        COLOR_MAP['UTR3'],  # 4: 橙色
-        COLOR_MAP['start_codon'],  # 5: 黄色
-        COLOR_MAP['stop_codon']  # 6: 紫色
     ]
-    custom_cmap = ListedColormap(custom_colors)
-    # 创建数值数组，每个数值对应一种颜色
-    colors = [region_to_value['default']]*len(full_sequence)
-    for i in utr5_range:
-        colors[i-1]= region_to_value['UTR5']
-    for i in utr3_range:
-        colors[i-1] = region_to_value['UTR3']
-    for i in start_codon_range:
-        colors[i-1] = region_to_value['CDS_start']
-    for i in stop_codon_range:
-        colors[i-1] = region_to_value['CDS_end']
-    for i in start_codon:
-        colors[i-1] = region_to_value['start_codon']
-    for i in stop_codon:
-        colors[i-1] = region_to_value['stop_codon']
-    draw_struct(full_sequence, structure,
-        c = colors,
-        cmap = custom_cmap,
-        vmin = 0,
-        vmax = 6,
-                line=algorithm,
-                )
-    # 添加图例
-    color_scheme = COLOR_MAP
-    legend_elements = [
-        plt.Rectangle((0, 0), 1, 1, facecolor=color_scheme['UTR5'], edgecolor='black', label="5'UTR"),
-        plt.Rectangle((0, 0), 1, 1, facecolor=color_scheme['CDS_start'], edgecolor='black', label="CDS Start"),
-        plt.Rectangle((0, 0), 1, 1, facecolor=color_scheme['CDS_end'], edgecolor='black', label="CDS End"),
-        plt.Rectangle((0, 0), 1, 1, facecolor=color_scheme['UTR3'], edgecolor='black', label="3'UTR"),
-        plt.Rectangle((0, 0), 1, 1, facecolor=color_scheme['start_codon'], edgecolor='black', label="Start Codon"),
-        plt.Rectangle((0, 0), 1, 1, facecolor=color_scheme['stop_codon'], edgecolor='black', label="Stop Codon"),
     ]
-    plt.legend(handles=legend_elements, loc='upper left', bbox_to_anchor=(1.05, 1), fontsize=10)
-    # 调整布局并保存
-    plt.savefig(output_file, dpi=300, bbox_inches='tight')
-    plt.close()
-    print(f"Successfully created: {output_file}")
-def draw_simple(utr5_seq, title=''):
-    img_paths = []
-    stru5, mfe = calc_mfe(utr5_seq)
-    import matplotlib.pyplot as plt
-    from draw_rna.ipynb_draw import draw_struct
-    draw_struct(utr5_seq, stru5)
-    # 创建临时文件
-    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmpfile:
-        output_file = tmpfile.name
-        img_paths.append(output_file)
-    plt.title(title)
-    # 保存当前活动的图形
-    plt.savefig(output_file,
-                dpi=300,
-                bbox_inches='tight',
-                facecolor='white',
-                edgecolor='none')
-    return img_paths, mfe, stru5
-def generate_rna_structure(utr5_seq, cds_seq, utr3_seq, structure, draw_2d=["mRNA"]):
-    """生成RNA结构图"""
-    message = ""
-    # 组合完整序列
-    full_sequence = utr5_seq + cds_seq + utr3_seq
-    mfe = None
-    img_paths = []
-    if "Full mRNA" in draw_2d:
-        if structure == "":
-            structure, mfe = calc_mfe(full_sequence)
-        # 验证序列和结构长度匹配
-        if len(full_sequence) != len(structure):
-            return f"序列长度({len(full_sequence)})与结构长度({len(structure)})不匹配"
-        '''full mRNA'''
-        # 获取各区域位置
-        utr5_range, start_codon_range, stop_codon_range, utr3_range, start_codon, stop_codon = get_bases_index(
-            utr5_seq, cds_seq, utr3_seq
-        )
-        focus_region = f'{min(eval(utr5_range))}-{max(eval(start_codon_range))}:fill=#bcffdd;{min(eval(stop_codon_range))}-{max(eval(utr3_range))}:fill=#bcffdd'
-        pairs = dbn_to_tuple(structure, c1_region=eval(','.join([utr5_range, start_codon, start_codon_range])),
-                             c2_region=eval(','.join([stop_codon_range, utr3_range, stop_codon])))
-        auxBPs = ';'.join([f'({key},{value}):color=#6ed86e' for key, value in pairs.items()])
-        for algorithm in ["line", "naview"]:
-            # 创建临时文件
-            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmpfile:
-                output_file = tmpfile.name
-                img_paths.append((output_file,f'mRNA_{algorithm}'))
-            # algorithm = "line"  # 线条算法
-            # 构建VARNA命令, write to file
-            # run_VARNA(full_sequence, structure, utr5_range, start_codon_range, stop_codon_range, utr3_range,
-            #           start_codon, stop_codon, focus_region, auxBPs, output_file, algorithm, title='mRNA')
-            #
-            algorithm = algorithm=="line"  # 线条算法
-            # 构建VARNA命令, write to file
-            run_draw_rna_advanced(full_sequence, structure, utr5_range, start_codon_range, stop_codon_range, utr3_range,
-                      start_codon, stop_codon, focus_region, auxBPs, output_file,algorithm, title='mRNA')
-    if "5'leader (30 nt)" in draw_2d:
-        img_path, local_mfe, stru5 = draw_simple(full_sequence[:30], title="5'leader (30 nt))")
-        img_paths.extend((img_path,'head_30'))
-        message += f"\nhead(30nt) MFE={local_mfe:.2f} kcal/mol"
-    if "5'UTR" in draw_2d:
-        img_path, local_mfe, stru5 = draw_simple(utr5_seq, title="5'UTR")
-        img_paths.extend((img_path,'utr5'))
-        message += f"\n5'UTR MFE={local_mfe:.2f} kcal/mol"
-    if "CDS" in draw_2d:
-        img_path, local_mfe, stru5 = draw_simple(cds_seq, title="CDS")
-        img_paths.extend((img_path,'cds'))
-        message += f"\nCDS MFE={local_mfe:.2f} kcal/mol"
-    if "3'UTR" in draw_2d:
-        img_path, local_mfe, stru5 = draw_simple(utr3_seq, title="3'UTR")
-        img_paths.extend((img_path,'utr3'))
-        message += f"\n3'UTR MFE={local_mfe:.2f} kcal/mol"
-    return img_paths, mfe, structure, message
-def visualize_rna(utr5_seq, cds_seq, utr3_seq, structure):
-    """可视化RNA结构的主函数"""
-    # 生成RNA结构图
-    image_path, mfe, structure, message = generate_rna_structure(utr5_seq, cds_seq, utr3_seq, structure)
-    mfe = f'MFE={mfe:.2f} kcal/mol' if mfe else None
-    # 返回图像
-    return image_path, mfe, structure, message
-def draw_rna_2d():
-    # 创建Gradio界面
-    with gr.Blocks(title="RNA结构可视化") as demo:
-        gr.Markdown("# RNA结构可视化工具")
-        gr.Markdown("使用VARNA可视化RNA二级结构，并高亮显示不同区域")
-        with gr.Row():
-            with gr.Column(scale=1):
-                utr5_seq = gr.Textbox(label="5'UTR序列", value="AUGCCAUGAACAGCUAC", placeholder="输入5'UTR序列...")
-                cds_seq = gr.Textbox(label="CDS序列", value="AUGCCAUGAACAGCUAC", placeholder="输入CDS序列...")
-                utr3_seq = gr.Textbox(label="3'UTR序列", value="AUGCCAUGAACAGCUAC", placeholder="输入3'UTR序列...")
-                structure = gr.Textbox(
-                    label="二级结构",
-                    value="...........((((.((((.((((........)))).))))...))))..",
-                    placeholder="输入点括号表示的二级结构..."
-                )
-                submit_btn = gr.Button("生成结构图", variant="primary")
-            with gr.Column():
-                # output_image = gr.Image(label="RNA结构图", interactive=False)
-                output_image = gr.Gallery(label="RNA结构图", interactive=False, object_fit="contain")
-                mfe = gr.Markdown(label="MFE", value="")
-                message = gr.Markdown(label="Message", value="")
-        # 颜色图例
-        with gr.Accordion("颜色说明", open=False):
-            gr.Markdown("""
-            | 颜色 | 区域 |
-            |------|------|
-            | <span style="color:red">■</span> 红色 | 5'UTR 区域 |
-            | <span style="color:blue">■</span> 蓝色 | CDS起始区域 |
-            | <span style="color:#FFC0CB">■</span> 粉色 | CDS终止区域 |
-            | <span style="color:orange">■</span> 橙色 | 3'UTR 区域 |
-            | <span style="color:yellow">■</span> 黄色 | 起始密码子 (AUG) |
-            | <span style="color:purple">■</span> 紫色 | 终止密码子 (UAA, UAG, UGA) |
-            """)
-        # 示例数据
-        with gr.Accordion("示例数据", open=False):
-            gr.Examples(
-                examples=[
-                    [
-                        "AUGCCAUGAACAGCUAC",
-                        "AUGCCAUGAACAGCUAC",
-                        "AUGCCAUGAACAGCUAC",
-                        "...........((((.((((.((((........)))).))))...)))).."
-                    ],
-                    [
-                        "GGGAAAUUUCCC",
-                        "AUGCCAUGAACAGCUAC",
-                        "UUUAAAGGGCCC",
-                        "((((....))))..(((.((((.......))))...))).."
-                    ]
-                ],
-                inputs=[utr5_seq, cds_seq, utr3_seq, structure]
-            )
-        # 提交处理
-        submit_btn.click(
-            visualize_rna,
-            inputs=[utr5_seq, cds_seq, utr3_seq, structure],
-            outputs=[output_image, mfe, structure, message]
-        )
-    return demo
-# 运行应用
 if __name__ == "__main__":
-    demo = draw_rna_2d()
-    demo.launch(server_port=8080, debug=True)

 import os
+import random
 import pandas as pd
 import numpy as np
+from typing import List, Dict, Tuple, Union
+from collections import defaultdict
class Codon:
    """Codon-usage helper: CAI / ENC / RSCU metrics and codon assignment.

    A codon-usage table (CSV with a header line and ``codon, amino_acid,
    fraction`` as the first three columns) is loaded at construction time.
    Codons are stored internally in the RNA alphabet (``U``) and amino-acid
    labels from the table are stored lower-cased.  Input sequences may be RNA
    or DNA; ``rna`` only controls the alphabet of generated sequences.
    """

    # Standard genetic code in the RNA alphabet; '*' marks stop codons.
    CODON_TO_AA = {
            'UUU': 'F', 'UUC': 'F',  # Phe (2-fold)
            'UUA': 'L', 'UUG': 'L', 'CUU': 'L', 'CUC': 'L', 'CUA': 'L', 'CUG': 'L',  # Leu (6-fold)
            'AUU': 'I', 'AUC': 'I', 'AUA': 'I',  # Ile (3-fold)
            'AUG': 'M',  # Met (no synonymous codons, excluded from ENC)
            'GUU': 'V', 'GUC': 'V', 'GUA': 'V', 'GUG': 'V',  # Val (4-fold)
            'UCU': 'S', 'UCC': 'S', 'UCA': 'S', 'UCG': 'S', 'AGU': 'S', 'AGC': 'S',  # Ser (6-fold)
            'CCU': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',  # Pro (4-fold)
            'ACU': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',  # Thr (4-fold)
            'GCU': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',  # Ala (4-fold)
            'UAU': 'Y', 'UAC': 'Y',  # Tyr (2-fold)
            'UAA': '*', 'UAG': '*', 'UGA': '*',  # stop codons (excluded)
            'CAU': 'H', 'CAC': 'H',  # His (2-fold)
            'CAA': 'Q', 'CAG': 'Q',  # Gln (2-fold)
            'AAU': 'N', 'AAC': 'N',  # Asn (2-fold)
            'AAA': 'K', 'AAG': 'K',  # Lys (2-fold)
            'GAU': 'D', 'GAC': 'D',  # Asp (2-fold)
            'GAA': 'E', 'GAG': 'E',  # Glu (2-fold)
            'UGU': 'C', 'UGC': 'C',  # Cys (2-fold)
            'UGG': 'W',  # Trp (no synonymous codons, excluded from ENC)
            'CGU': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'AGA': 'R', 'AGG': 'R',  # Arg (6-fold)
            'GGU': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'  # Gly (4-fold)
        }

    def __init__(self, codon_usage_path, rna=True):
        """Load the codon-usage table and build the derived lookup tables.

        Args:
            codon_usage_path: Path of the codon-usage CSV.  When the file is
                unreadable or empty a message is printed and all tables stay
                empty (no exception is raised).
            rna: If true, generated sequences use ``U``; otherwise ``T``.
        """
        self.bases = 'GAUC'
        self.aas = 'ACDEFGHIKLMNPQRSTVWY*'.lower()
        # codon (RNA) -> (lower-case amino acid, fraction)
        self.codon_table = {}
        # amino acid -> highest fraction among its codons
        self.max_aa_table = {}
        # amino acid -> codon carrying that highest fraction
        self.cai_best_aa2nn_table = {}
        # codon position (0..2) -> amino acid -> base -> summed fraction
        self.frame_ith_aa_base_fraction = {
            i: {a: {base: 0.0 for base in self.bases} for a in self.aas}
            for i in range(3)
        }
        # `rna` only selects the output alphabet; input may be RNA or DNA.
        self.output_rna = rna
        # Standard RNA codon table used by the ENC and RSCU calculations.
        self.standard_codon_table = self.CODON_TO_AA
        # Amino acids grouped by the size of their synonymous-codon family.
        self.degeneracy_groups = {
            '2-fold': ['F', 'Y', 'C', 'H', 'Q', 'N', 'K', 'D', 'E'],
            '3-fold': ['I'],
            '4-fold': ['V', 'P', 'T', 'A', 'G'],
            '6-fold': ['L', 'S', 'R']
        }
        if os.access(codon_usage_path, os.R_OK) and os.path.getsize(codon_usage_path) > 0:
            self._load_codon_usage(codon_usage_path)
            print(f"Codon usage table loaded, {len(self.codon_table)} codons loaded from {codon_usage_path}")
        else:
            print('codon usage table is missing', codon_usage_path)
        self.aa_to_codons = self._build_aa_to_codons()
        # Per-amino-acid codon weights, aligned with aa_to_codons, for weighted sampling.
        self.aa_to_weights = self._build_aa_to_weights()
        self.calculate_CAI = self.calc_cai

    def _load_codon_usage(self, codon_usage_path):
        """Parse the CSV at ``codon_usage_path`` into the instance tables."""
        with open(codon_usage_path, 'r') as codon_file:
            next(codon_file, None)  # skip the header line
            for line in codon_file:
                line = line.strip()
                if not line:
                    continue
                codon, aa, fraction, *_ = line.split(',')
                # Internal representation: RNA codons, lower-case amino acids.
                codon = codon.strip().upper().replace('T', 'U')
                aa = aa.strip().lower()
                fraction = float(fraction)
                self.codon_table[codon] = (aa, fraction)
                for i, base in enumerate(codon):
                    # setdefault/get keep labels such as "stop" or bases such
                    # as "N" from raising KeyError against the preset tables.
                    per_base = self.frame_ith_aa_base_fraction[i].setdefault(
                        aa, {b: 0.0 for b in self.bases})
                    per_base[base] = per_base.get(base, 0.0) + fraction
                if aa not in self.max_aa_table or self.max_aa_table[aa] < fraction:
                    self.max_aa_table[aa] = fraction
                    self.cai_best_aa2nn_table[aa] = codon

    def _build_aa_to_codons(self):
        """Return a dict mapping each amino acid to the list of its codons."""
        aa_to_codons = defaultdict(list)
        for codon, (aa, _) in self.codon_table.items():
            aa_to_codons[aa].append(codon)
        return dict(aa_to_codons)

    def _build_aa_to_weights(self):
        """Return a dict mapping each amino acid to its codon fractions.

        The order matches ``_build_aa_to_codons`` because both iterate
        ``self.codon_table`` in the same order.
        """
        aa_to_weights = defaultdict(list)
        for codon, (aa, weight) in self.codon_table.items():
            aa_to_weights[aa].append(weight)
        return dict(aa_to_weights)

    def _normalize_sequence(self, sequence: str) -> str:
        """Upper-case ``sequence`` and convert DNA ``T`` to RNA ``U``."""
        return sequence.upper().replace('T', 'U')

    def _validate_sequence(self, sequence: str) -> str:
        """Normalize ``sequence`` and check that it is a whole number of codons.

        Raises:
            ValueError: If the length is not a multiple of 3 or a character
                other than A/U/C/G remains after normalization.
        """
        sequence = self._normalize_sequence(sequence)
        if len(sequence) % 3 != 0:
            raise ValueError(f"序列长度必须是3的倍数，当前长度: {len(sequence)}")
        if not set(sequence) <= {'A', 'U', 'C', 'G'}:
            raise ValueError("序列包含无效的碱基字符")
        return sequence

    def _count_codons(self, sequence: str) -> Dict[str, int]:
        """Count sense codons (stop codons excluded) in a validated sequence.

        Raises:
            ValueError: Propagated from ``_validate_sequence``.
        """
        sequence = self._validate_sequence(sequence)
        codon_count = {}
        for start in range(0, len(sequence), 3):
            codon = sequence[start:start + 3]
            if self.standard_codon_table.get(codon, '*') != '*':
                codon_count[codon] = codon_count.get(codon, 0) + 1
        return codon_count

    @staticmethod
    def translate_sequence(sequence: str) -> str:
        """Translate an RNA/DNA sequence with the standard genetic code.

        Triplets that are not in ``CODON_TO_AA`` (including a trailing
        incomplete codon) are silently skipped; stop codons become ``*``.
        """
        sequence = sequence.upper().replace('T', 'U')
        triplets = (sequence[i:i + 3] for i in range(0, len(sequence), 3))
        return ''.join(Codon.CODON_TO_AA[t] for t in triplets if t in Codon.CODON_TO_AA)

    def calc_cai(self, seq):
        """Return the Codon Adaptation Index of ``seq`` (RNA or DNA).

        Each codon found in the loaded table contributes the weight
        ``fraction / max fraction of its amino acid``; the result is the
        geometric mean of those weights.

        Returns:
            ``np.nan`` when the length is not a multiple of 3, ``0.0`` when no
            codon of the sequence is in the table (or one has zero fraction),
            otherwise the CAI value.
        """
        seq = self._normalize_sequence(seq)
        if len(seq) % 3 != 0:
            return np.nan
        log_sum = 0.0
        valid_num = 0
        for i in range(0, len(seq), 3):
            codon = seq[i:i + 3]
            if codon not in self.codon_table:
                continue
            aa, fraction = self.codon_table[codon]
            f_c_max = self.max_aa_table[aa]
            if f_c_max <= 0:
                # Every codon of this amino acid has zero fraction: the weight
                # is undefined, so skip instead of dividing by zero.
                continue
            if fraction == 0:
                # A zero weight forces the geometric mean to zero; return it
                # directly rather than going through log2(0) = -inf.
                return 0.0
            log_sum += np.log2(fraction / f_c_max)
            valid_num += 1
        return np.exp2(log_sum / valid_num) if valid_num > 0 else 0.0

    def cai_opt_codon(self, aa_seq):
        """Return the sequence built from the highest-fraction codon per residue.

        Residues without an entry in the loaded table are emitted as ``___``.
        The output alphabet follows ``self.output_rna``.
        """
        aa_seq = aa_seq.lower()
        best_codons = []
        for aa in aa_seq:
            codon = self.cai_best_aa2nn_table.get(aa, '___')
            if not self.output_rna:
                codon = codon.replace('U', 'T')
            best_codons.append(codon)
        return ''.join(best_codons)

    def random_codon(self, aa_seq):
        """Sample a coding sequence, weighting codons by their table fraction.

        Args:
            aa_seq (str): Amino-acid sequence (one-letter codes).

        Returns:
            str: Sampled sequence in the alphabet selected by
            ``self.output_rna``; residues missing from the table become ``___``.
        """
        aa_seq = aa_seq.lower()
        sampled = []
        for aa in aa_seq:
            if aa not in self.aa_to_codons:
                codon = '___'
            else:
                codons = self.aa_to_codons[aa]
                weights = self.aa_to_weights[aa]
                codon = random.choices(codons, weights=weights, k=1)[0]
            sampled.append(codon)
        opt_nn = ''.join(sampled)
        if not self.output_rna:
            opt_nn = opt_nn.replace('U', 'T')
        return opt_nn

    def random_codon_weight(self, aa_seq,weights_df=None):
        """Sample a coding sequence using weights from ``weights_df``.

        Falls back to ``random_codon`` when ``weights_df`` is ``None``.

        Args:
            aa_seq (str): Amino-acid sequence (one-letter codes).
            weights_df: Table with ``triplet``, ``amino_acid`` and ``fraction``
                columns.  NOTE(review): residues are lower-cased before being
                matched against ``amino_acid`` — presumably the column holds
                lower-case letters; confirm against the caller.

        Returns:
            str: Sampled sequence; residues missing from the instance's own
            table become ``___``.  ``U`` is replaced by ``T`` when
            ``self.output_rna`` is false.

        Raises:
            IndexError: From ``random.choices`` when ``weights_df`` has no row
                for a residue.
        """
        if weights_df is None:
            return self.random_codon(aa_seq)
        aa_seq = aa_seq.lower()
        # Filter the table once per distinct residue instead of once per position.
        choices_by_aa = {}
        sampled = []
        for aa in aa_seq:
            if aa not in self.aa_to_codons:
                codon = '___'
            else:
                if aa not in choices_by_aa:
                    subset = weights_df[weights_df['amino_acid'] == aa]
                    choices_by_aa[aa] = (subset['triplet'].to_list(),
                                         subset['fraction'].to_list())
                codons, weights = choices_by_aa[aa]
                codon = random.choices(codons, weights=weights, k=1)[0]
            sampled.append(codon)
        opt_nn = ''.join(sampled)
        if not self.output_rna:
            opt_nn = opt_nn.replace('U', 'T')
        return opt_nn

    def calculate_ENC(self, sequence: str) -> float:
        """Return the effective number of codons (Wright's Nc) of one sequence.

        For every amino acid with synonymous codons the homozygosity
        ``F = (n * sum(p_i ** 2) - 1) / (n - 1)`` is computed, where ``n`` is
        the number of codons observed for that amino acid and ``p_i`` the
        relative frequency of each synonymous codon.  ``F`` is averaged per
        degeneracy class and combined as ``2 + 9/F2 + 1/F3 + 5/F4 + 3/F6``.

        Amino acids observed fewer than twice, or whose ``F`` is not positive,
        are left out because ``1/F`` is undefined for them; a degeneracy class
        without any usable amino acid contributes nothing.  The result is
        capped at 61, the number of sense codons.

        Args:
            sequence: RNA or DNA coding sequence.

        Raises:
            ValueError: If the sequence fails ``_validate_sequence``.
        """
        codon_count = self._count_codons(sequence)
        class_of = {aa: label
                    for label, members in self.degeneracy_groups.items()
                    for aa in members}
        # Met, Trp and stop are in no degeneracy group, so they drop out here.
        counts_by_aa = defaultdict(list)
        for codon, aa in self.standard_codon_table.items():
            if aa in class_of:
                counts_by_aa[aa].append(codon_count.get(codon, 0))

        homozygosity = {label: [] for label in self.degeneracy_groups}
        for aa, counts in counts_by_aa.items():
            n = sum(counts)
            if n < 2:
                continue
            f_value = (n * sum((c / n) ** 2 for c in counts) - 1) / (n - 1)
            if f_value > 0:
                homozygosity[class_of[aa]].append(f_value)

        # Number of amino acids in each degeneracy class.
        class_sizes = {'2-fold': 9.0, '3-fold': 1.0, '4-fold': 5.0, '6-fold': 3.0}
        enc_value = 2.0
        for label, size in class_sizes.items():
            values = homozygosity.get(label)
            if values:
                enc_value += size / (sum(values) / len(values))
        return min(enc_value, 61.0)

    def calculate_RSCU(self, sequences: List[str]) -> Dict[str, float]:
        """Return Relative Synonymous Codon Usage over a set of sequences.

        ``RSCU = observed count / (amino-acid total / number of synonymous
        codons)``, where the number of synonymous codons is the full size of
        the codon family in the standard genetic code.

        Args:
            sequences: RNA or DNA sequences; those failing validation are skipped.

        Returns:
            Dict mapping each *observed* sense codon (RNA alphabet) to its RSCU.
        """
        total_codon_count = defaultdict(int)
        for seq in sequences:
            try:
                codon_count = self._count_codons(seq)
            except ValueError:
                continue  # skip invalid sequences
            for codon, count in codon_count.items():
                total_codon_count[codon] += count

        # Size of each synonymous family and observed total per amino acid.
        family_size = defaultdict(int)
        for aa in self.standard_codon_table.values():
            family_size[aa] += 1
        aa_total_count = defaultdict(int)
        for codon, count in total_codon_count.items():
            aa_total_count[self.standard_codon_table[codon]] += count

        rscu_dict = {}
        for codon, count in total_codon_count.items():
            aa = self.standard_codon_table[codon]
            expected_count = aa_total_count[aa] / family_size[aa]
            rscu_dict[codon] = count / expected_count if expected_count > 0 else 0.0
        return rscu_dict

    def analyze_sequence(self, sequence: str, sequence_name: str = "") -> Dict:
        """Summarize ENC and CAI for one sequence.

        Args:
            sequence: RNA or DNA coding sequence.
            sequence_name: Optional label copied into the result.

        Returns:
            Dict with name, length, ENC, CAI and their coarse labels.  If any
            metric raises, ENC and CAI are ``None`` and ``Error`` holds the
            exception message instead.
        """
        try:
            enc = self.calculate_ENC(sequence)
            cai = self.calc_cai(sequence)
            return {
                'Sequence_Name': sequence_name,
                'Sequence_Length': len(sequence),
                'ENC': round(enc, 3),
                # NOTE(review): 'week' looks like a typo for 'weak'; kept
                # because callers may match on the emitted value.
                'ENC_Preference': 'strong' if enc <= 35 else 'week',
                'CAI': round(cai, 3),
                'CAI_Level': 'high' if cai > 0.7 else 'low'
            }
        except Exception as e:
            # Best-effort report: surface the failure in the result dict.
            return {
                'Sequence_Name': sequence_name,
                'Sequence_Length': len(sequence),
                'ENC': None,
                'CAI': None,
                'Error': str(e)
            }

    @staticmethod
    def modify_func(sequence):
        """Default masking function: replace every character with ``_``."""
        return '_'*len(sequence)

    @staticmethod
    def modify_codon_by_frames(sequence, frames=(1, 2, 3), modify_func=None):
        """Rebuild ``sequence`` once per requested codon position, modified.

        The sequence is upper-cased, stripped of spaces/newlines and trimmed
        to whole codons.  For each entry of ``frames`` the bases at that codon
        position are passed (as one string) to ``modify_func`` and the
        sequence is re-interleaved; every result modifies only its own frame.

        Args:
            sequence (str): Input nucleotide sequence.
            frames: Codon positions to modify, each 1, 2 or 3.
            modify_func (callable): Receives the string of bases at one codon
                position and returns a replacement of the same length.  When
                ``None`` the sequence is returned unmodified.

        Returns:
            list[str]: One reconstructed sequence per entry of ``frames``.

        Raises:
            ValueError: If a frame is not 1, 2 or 3, or ``modify_func`` changes
                the length of a frame.
        """
        seq = sequence.upper().replace(' ', '').replace('\n', '')
        seq = seq[:len(seq) - len(seq) % 3]
        position_strings = [seq[0::3], seq[1::3], seq[2::3]]
        reconstructed_list = []
        for frame in frames:
            if frame not in (1, 2, 3):
                raise ValueError(f"frame must be 1, 2 or 3, got {frame!r}")
            current = list(position_strings)
            if modify_func:
                replaced = modify_func(current[frame - 1])
                if len(replaced) != len(current[frame - 1]):
                    raise ValueError("modify_func must preserve the frame length")
                current[frame - 1] = replaced
            reconstructed_list.append(
                ''.join(a + b + c for a, b, c in zip(*current)))
        return reconstructed_list
# Usage example - exercises every public feature of Codon
def example_usage(
        codon_table_path="/Users/gz_julse/code/minimind_RiboUTR/maotao_file/codon_table/codon_usage_{species}.csv"):
    """Run a printed walkthrough of the Codon API.

    Args:
        codon_table_path: Template for the per-species codon-usage CSV; it
            must contain a ``{species}`` placeholder.  The default is the
            original author's local path, so pass your own template when
            running elsewhere.  A missing file only produces empty tables.
    """
    print("=" * 60)
    print("测试 Codon 类的所有功能")
    print("=" * 60)
    # Test data
    species_list = ["mouse", "Ec", "Sac", "Pic", "Human"]
    test_species = "mouse"  # species used for the detailed checks
    # Test sequences
    aa_seq = "MASV"
    dna_seq = "ATGGCCATGGCGCCCAGAACTGAGATCAAATAGTACCCGTATTAACGGGTA"
    rna_seq = dna_seq.replace('T', 'U')
    # Sequence set for the RSCU calculation
    test_sequences = [
        "AUGGCUUCUUUUUUCUUCUUCUUCUUCUUCUUCCUCCUCCUCCUCCUCCUCCUCCUC",  # RNA
        "ATGGCUUCUUUUCUCGUAUACACAGATGACTACGTTAGCAGCTACGTTACGTTACGTTACG",  # DNA
        "AUGGUUUGUUGGUUGGUUGGUUGGUUGGUUGGUUGGUUGGUUGGUUGGUUGGUUGGA"  # RNA
    ]
    # Single test sequence
    test_sequence = "AUGGCUUCUUUUCUCGUAUACACAGAUGACUACGUAGCAGCUACGUACGUACGUACG"
    Codon.translate_sequence(dna_seq)  # smoke-check that translation runs
    print(f"\n1. 初始化 Codon 实例 (物种: {species_list})")
    print("-" * 50)
    # One instance per species for each output alphabet
    codon_instance_dna = {species: Codon(codon_table_path.format(species=species), rna=False)
                          for species in species_list}
    codon_instance_rna = {species: Codon(codon_table_path.format(species=species), rna=True)
                          for species in species_list}
    rna_codon = codon_instance_rna[test_species]
    dna_codon = codon_instance_dna[test_species]
    print(f"✓ 成功创建 {len(species_list)} 个物种的 Codon 实例")
    print(f"\n2. 测试 CAI 计算")
    print("-" * 50)
    # DNA and RNA inputs
    print("DNA序列CAI:", [codon_instance_rna[species].calc_cai(dna_seq) for species in species_list])
    print("RNA序列CAI:", [codon_instance_rna[species].calc_cai(rna_seq) for species in species_list])
    # DNA and RNA inputs must give the same CAI
    dna_cai = rna_codon.calc_cai(dna_seq)
    rna_cai = rna_codon.calc_cai(rna_seq)
    print(f"✓ DNA和RNA输入结果一致: {np.isclose(dna_cai, rna_cai)}")
    print(f"\n3. 测试 CAI 最优密码子序列")
    print("-" * 50)
    # CAI-optimal codon sequence in both alphabets
    opt_rna = rna_codon.cai_opt_codon(aa_seq)
    opt_dna = dna_codon.cai_opt_codon(aa_seq)
    rna_ok = 'T' not in opt_rna
    dna_ok = 'U' not in opt_dna
    print(f"氨基酸序列: {aa_seq}")
    print(f"RNA格式最优密码子: {opt_rna}")
    print(f"DNA格式最优密码子: {opt_dna}")
    print(f"✓ 输出格式正确: RNA={rna_ok}, DNA={dna_ok}")
    print(f"\n4. 测试 ENC 计算")
    print("-" * 50)
    # ENC for DNA and RNA inputs
    enc_dna = rna_codon.calculate_ENC(dna_seq)
    enc_rna = rna_codon.calculate_ENC(rna_seq)
    print(f"DNA序列ENC: {enc_dna:.3f}")
    print(f"RNA序列ENC: {enc_rna:.3f}")
    print(f"✓ DNA和RNA输入结果一致: {np.isclose(enc_dna, enc_rna)}")
    print(f"\n5. 测试 RSCU 计算")
    print("-" * 50)
    # RSCU over the sequence set
    rscu_results = rna_codon.calculate_RSCU(test_sequences)
    print(f"计算了 {len(rscu_results)} 个密码子的RSCU值")
    print("前10个密码子的RSCU值:")
    for codon, rscu in list(rscu_results.items())[:10]:
        print(f"  {codon}: {rscu:.3f}")
    print(f"\n6. 测试综合分析 (analyze_sequence)")
    print("-" * 50)
    # Combined analysis
    analysis_result = rna_codon.analyze_sequence(test_sequence, "Test_Gene")
    print("综合分析结果:")
    for key, value in analysis_result.items():
        print(f"  {key}: {value}")
    print(f"\n7. 测试序列验证功能")
    print("-" * 50)
    # Invalid sequences that validation must reject
    invalid_seqs = [
        "AUGGCUUCUUUUCUCG",  # length is not a multiple of 3
        "AUGXXXUUUUCUCGUAUACACAGAUGACUACGUAGCAGCUACGUACGUACGUACG",  # invalid characters
    ]
    for i, seq in enumerate(invalid_seqs):
        try:
            rna_codon._validate_sequence(seq)
            print(f"序列 {i + 1}: 错误地通过了验证")
        except ValueError as e:
            print(f"序列 {i + 1}: 正确捕获错误 - {e}")
    print(f"\n8. 测试密码子计数")
    print("-" * 50)
    # Codon counting
    codon_count = rna_codon._count_codons(test_sequence)
    print(f"序列 '{test_sequence[:20]}...' 的密码子计数:")
    for codon, count in list(codon_count.items())[:5]:
        print(f"  {codon}: {count}")
    print(f"  ... (共 {len(codon_count)} 种密码子)")
    print(f"\n9. 测试不同输出格式的兼容性")
    print("-" * 50)
    # CAI must not depend on the instance's output alphabet
    cai_rna_instance = rna_codon.calc_cai(test_sequence)
    cai_dna_instance = dna_codon.calc_cai(test_sequence)
    print(f"RNA输出实例CAI: {cai_rna_instance:.4f}")
    print(f"DNA输出实例CAI: {cai_dna_instance:.4f}")
    print(f"✓ 不同输出格式实例的CAI计算相同: {np.isclose(cai_rna_instance, cai_dna_instance)}")
    print(f"\n" + "=" * 60)
    print("所有功能测试完成!")
    print("=" * 60)


if __name__ == "__main__":
    example_usage()