from Bio import SeqIO
# Print the id and the sequence of every record in the protein FASTA file,
# each on its own line.
protein_fasta = "D:/BaiduSyncdisk/20231009论文写作/投稿文件/trends_in_biotechnology/编辑意见返修/解脂耶氏酵母基因组注释+模型集生成+多尺度模型集+精炼+验证/1酵母基因组注释/Yarrowia_lipolytica_protein.faa"
sequences = SeqIO.parse(protein_fasta, "fasta")
for seq_record in sequences:
    print(seq_record.id, seq_record.seq, sep="\n")

import pandas as pd
# Column labels applied to the tab-separated hit table, which is read
# without a header row.
columns = [
    'qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen',
    'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore',
]
df = pd.read_csv(
    'D:/BaiduSyncdisk/20231009论文写作/投稿文件/trends_in_biotechnology/编辑意见返修/解脂耶氏酵母基因组注释+模型集生成+多尺度模型集+精炼+验证/1酵母基因组注释/out.tsv',
    sep='\t',
    names=columns,
)
# Order rows by subject id, then bitscore, then percent identity, all
# descending, so the first row of every subject id is its top-ranked hit.
df = df.sort_values(by=['sseqid', 'bitscore', 'pident'], ascending=False)

# Pick one hit per subject sequence (sseqid): the first row of each sseqid in
# the current row order of `df`.  With `df` sorted by sseqid, bitscore and
# pident (all descending) that first row is the top-ranked hit.
gene_seq = list(df.sseqid)
best_hits = df.drop_duplicates(subset='sseqid', keep='first')

# Unique subject ids in first-seen order.
new_li = list(best_hits.sseqid)

# `Res` is kept as a plain list of rows (one list per selected hit, fields in
# the order given by `columns`) because the code further down indexes it
# positionally: row[0] = qseqid, row[1] = sseqid, row[11] = bitscore.
Res = best_hits.values.tolist()

# Export the table through a separate name so `Res` is not rebound to a
# DataFrame.  The former `Res.to_excel("Yarrowia_lipolytica_model.xlsx", ...)`
# call on the list was removed: a list has no `to_excel` method, so it raised
# AttributeError; that file is written at the end of the script instead.
Res_df = pd.DataFrame(Res, columns=columns)
Res_df.to_excel("Yarrowia_lipolytica_model_gene_relation.xlsx", index=False)

# Read the reaction / GPR-rule sheet; its two columns are relabelled
# 'Rxn_name' and 'GPR'.
data_fungi_gpr = pd.read_excel(
    'D:/BaiduSyncdisk/20231009论文写作/投稿文件/trends_in_biotechnology/编辑意见返修/解脂耶氏酵母基因组注释+模型集生成+多尺度模型集+精炼+验证/1酵母基因组注释/new_model_fungi_grRules.xlsx',
    names=['Rxn_name', 'GPR'],
)
data_fungi_rxn_ = list(data_fungi_gpr.Rxn_name)
data_fungi_gpr_ = list(data_fungi_gpr.GPR)

# One mutable [reaction, gpr] pair per sheet row.
Res_gpr_rxn = [[rxn, gpr] for rxn, gpr in zip(data_fungi_rxn_, data_fungi_gpr_)]

# Drop the rows whose GPR cell is not a string (empty cells are read as NaN).
# The surviving pairs are the same list objects as in `Res_gpr_rxn`.
Res_gpr_rxn_withoutnan = [pair for pair in Res_gpr_rxn if isinstance(pair[1], str)]

# Find the parts of each GPR rule that can be annotated.
#
# A rule is treated as a flat "clause or clause or ..." string where every
# clause is "gene and gene and ...", optionally wrapped in parentheses.  A
# clause is kept only when every one of its genes appears as a subject id
# (field 1) in `Res`.  Requires `Res` to be a sequence of positional hit rows,
# not a DataFrame.
Res_annotation_gene_list = [row[1] for row in Res]
annotated_genes = set(Res_annotation_gene_list)  # constant-time membership tests

for entry in Res_gpr_rxn_withoutnan:
    kept_clauses = []
    for clause in entry[1].split(' or '):
        genes = [
            token.replace('(', '').replace(')', '')
            for token in clause.split(' and ')
        ]
        if all(gene in annotated_genes for gene in genes):
            kept_clauses.append(clause)
    # Every kept clause is prefixed with ' or ' (so a non-empty result starts
    # with ' or ').  An empty string means no clause of the rule could be
    # mapped; such rules do not go on to the scoring step.
    entry.append(''.join(' or ' + clause for clause in kept_clauses))

# Discard the rules whose filtered GPR (third field) is empty.
Res_gpr_rxn_withoutnan_without_emptygpr = [
    entry for entry in Res_gpr_rxn_withoutnan if entry[2] != ''
]

# Strip the leading " or " from the filtered GPR string.
for entry in Res_gpr_rxn_withoutnan_without_emptygpr:
    if entry[2].startswith(' or '):
        entry[2] = entry[2][4:]

# Replace the gene ids in each filtered GPR (third field) with protein ids.
#
# Each gene (a subject id, field 1 of a `Res` row) is swapped for the query id
# (field 0) of the same row.  Requires `Res` to be a sequence of positional hit
# rows, not a DataFrame.
Res_annotation_gene_list = [row[1] for row in Res]
gene_to_protein = {}
for row in Res:
    # Keep the first row seen for a gene, as a list `.index()` lookup would.
    gene_to_protein.setdefault(row[1], row[0])

for entry in Res_gpr_rxn_withoutnan_without_emptygpr:
    rewritten_clauses = []
    for clause in entry[2].split(' or '):
        rewritten_tokens = []
        for token in clause.split(' and '):
            gene = token.replace('(', '').replace(')', '')
            # Substitute inside this single token only.  A replace over the
            # whole rule string would also hit ids that merely contain `gene`
            # as a substring, and would rewrite already substituted text again
            # when the same gene occurs more than once in the rule.
            rewritten_tokens.append(token.replace(gene, gene_to_protein[gene]))
        rewritten_clauses.append(' and '.join(rewritten_tokens))
    entry[2] = ' or '.join(rewritten_clauses)

# Score every rule from the bitscore (field 11) of its proteins: proteins
# joined by "and" contribute the lowest score of their clause, and clauses
# joined by "or" contribute the highest clause score.  The result is appended
# to the rule as a fourth field.  Requires `Res` to be a sequence of
# positional hit rows, not a DataFrame.
Res_annotation_protein_list = [row[0] for row in Res]
protein_to_score = {}
for row in Res:
    # A protein id occurring in several rows takes the bitscore of its first
    # row, as a list `.index()` lookup would.
    protein_to_score.setdefault(row[0], row[11])

for entry in Res_gpr_rxn_withoutnan_without_emptygpr:
    clause_scores = []
    for clause in entry[2].split(' or '):
        proteins = (
            token.replace('(', '').replace(')', '')
            for token in clause.split(' and ')
        )
        clause_scores.append(min(protein_to_score[protein] for protein in proteins))
    entry.append(max(clause_scores))

# Write the four-field rule records to the final Excel workbook, without the
# index column.
output_columns = ['reaction', 'Pan_Fungi_GPR', 'Protein', 'Score']
df_ = pd.DataFrame(data=Res_gpr_rxn_withoutnan_without_emptygpr, columns=output_columns)
df_.to_excel(excel_writer="Yarrowia_lipolytica_model.xlsx", index=False)